From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/fs/xfs/Kconfig | 97 + kernel/fs/xfs/Makefile | 124 + kernel/fs/xfs/kmem.c | 127 + kernel/fs/xfs/kmem.h | 125 + kernel/fs/xfs/libxfs/xfs_alloc.c | 2639 +++++++++++++ kernel/fs/xfs/libxfs/xfs_alloc.h | 237 ++ kernel/fs/xfs/libxfs/xfs_alloc_btree.c | 503 +++ kernel/fs/xfs/libxfs/xfs_alloc_btree.h | 65 + kernel/fs/xfs/libxfs/xfs_attr.c | 1456 +++++++ kernel/fs/xfs/libxfs/xfs_attr_leaf.c | 2773 ++++++++++++++ kernel/fs/xfs/libxfs/xfs_attr_leaf.h | 110 + kernel/fs/xfs/libxfs/xfs_attr_remote.c | 626 +++ kernel/fs/xfs/libxfs/xfs_attr_remote.h | 27 + kernel/fs/xfs/libxfs/xfs_attr_sf.h | 70 + kernel/fs/xfs/libxfs/xfs_bit.h | 87 + kernel/fs/xfs/libxfs/xfs_bmap.c | 5945 +++++++++++++++++++++++++++++ kernel/fs/xfs/libxfs/xfs_bmap.h | 225 ++ kernel/fs/xfs/libxfs/xfs_bmap_btree.c | 883 +++++ kernel/fs/xfs/libxfs/xfs_bmap_btree.h | 143 + kernel/fs/xfs/libxfs/xfs_btree.c | 4067 ++++++++++++++++++++ kernel/fs/xfs/libxfs/xfs_btree.h | 468 +++ kernel/fs/xfs/libxfs/xfs_cksum.h | 63 + kernel/fs/xfs/libxfs/xfs_da_btree.c | 2660 +++++++++++++ kernel/fs/xfs/libxfs/xfs_da_btree.h | 221 ++ kernel/fs/xfs/libxfs/xfs_da_format.c | 908 +++++ kernel/fs/xfs/libxfs/xfs_da_format.h | 873 +++++ kernel/fs/xfs/libxfs/xfs_dir2.c | 731 ++++ kernel/fs/xfs/libxfs/xfs_dir2.h | 320 ++ 
kernel/fs/xfs/libxfs/xfs_dir2_block.c | 1254 ++++++ kernel/fs/xfs/libxfs/xfs_dir2_data.c | 1049 +++++ kernel/fs/xfs/libxfs/xfs_dir2_leaf.c | 1819 +++++++++ kernel/fs/xfs/libxfs/xfs_dir2_node.c | 2270 +++++++++++ kernel/fs/xfs/libxfs/xfs_dir2_priv.h | 134 + kernel/fs/xfs/libxfs/xfs_dir2_sf.c | 1142 ++++++ kernel/fs/xfs/libxfs/xfs_dquot_buf.c | 288 ++ kernel/fs/xfs/libxfs/xfs_format.h | 1461 +++++++ kernel/fs/xfs/libxfs/xfs_fs.h | 576 +++ kernel/fs/xfs/libxfs/xfs_ialloc.c | 2202 +++++++++++ kernel/fs/xfs/libxfs/xfs_ialloc.h | 167 + kernel/fs/xfs/libxfs/xfs_ialloc_btree.c | 420 ++ kernel/fs/xfs/libxfs/xfs_ialloc_btree.h | 65 + kernel/fs/xfs/libxfs/xfs_inode_buf.c | 476 +++ kernel/fs/xfs/libxfs/xfs_inode_buf.h | 50 + kernel/fs/xfs/libxfs/xfs_inode_fork.c | 1902 +++++++++ kernel/fs/xfs/libxfs/xfs_inode_fork.h | 171 + kernel/fs/xfs/libxfs/xfs_log_format.h | 679 ++++ kernel/fs/xfs/libxfs/xfs_log_recover.h | 66 + kernel/fs/xfs/libxfs/xfs_log_rlimit.c | 148 + kernel/fs/xfs/libxfs/xfs_quota_defs.h | 159 + kernel/fs/xfs/libxfs/xfs_rtbitmap.c | 991 +++++ kernel/fs/xfs/libxfs/xfs_sb.c | 803 ++++ kernel/fs/xfs/libxfs/xfs_sb.h | 38 + kernel/fs/xfs/libxfs/xfs_shared.h | 243 ++ kernel/fs/xfs/libxfs/xfs_symlink_remote.c | 201 + kernel/fs/xfs/libxfs/xfs_trans_resv.c | 878 +++++ kernel/fs/xfs/libxfs/xfs_trans_resv.h | 116 + kernel/fs/xfs/libxfs/xfs_trans_space.h | 92 + kernel/fs/xfs/libxfs/xfs_types.h | 137 + kernel/fs/xfs/mrlock.h | 90 + kernel/fs/xfs/uuid.c | 63 + kernel/fs/xfs/uuid.h | 35 + kernel/fs/xfs/xfs.h | 34 + kernel/fs/xfs/xfs_acl.c | 304 ++ kernel/fs/xfs/xfs_acl.h | 39 + kernel/fs/xfs/xfs_aops.c | 1931 ++++++++++ kernel/fs/xfs/xfs_aops.h | 60 + kernel/fs/xfs/xfs_attr.h | 154 + kernel/fs/xfs/xfs_attr_inactive.c | 465 +++ kernel/fs/xfs/xfs_attr_list.c | 653 ++++ kernel/fs/xfs/xfs_bit.c | 118 + kernel/fs/xfs/xfs_bmap_util.c | 1920 ++++++++++ kernel/fs/xfs/xfs_bmap_util.h | 79 + kernel/fs/xfs/xfs_buf.c | 1901 +++++++++ kernel/fs/xfs/xfs_buf.h | 393 ++ 
kernel/fs/xfs/xfs_buf_item.c | 1155 ++++++ kernel/fs/xfs/xfs_buf_item.h | 76 + kernel/fs/xfs/xfs_dir2_readdir.c | 681 ++++ kernel/fs/xfs/xfs_discard.c | 239 ++ kernel/fs/xfs/xfs_discard.h | 10 + kernel/fs/xfs/xfs_dquot.c | 1104 ++++++ kernel/fs/xfs/xfs_dquot.h | 188 + kernel/fs/xfs/xfs_dquot_item.c | 443 +++ kernel/fs/xfs/xfs_dquot_item.h | 47 + kernel/fs/xfs/xfs_error.c | 180 + kernel/fs/xfs/xfs_error.h | 153 + kernel/fs/xfs/xfs_export.c | 254 ++ kernel/fs/xfs/xfs_export.h | 72 + kernel/fs/xfs/xfs_extent_busy.c | 604 +++ kernel/fs/xfs/xfs_extent_busy.h | 73 + kernel/fs/xfs/xfs_extfree_item.c | 507 +++ kernel/fs/xfs/xfs_extfree_item.h | 81 + kernel/fs/xfs/xfs_file.c | 1534 ++++++++ kernel/fs/xfs/xfs_filestream.c | 431 +++ kernel/fs/xfs/xfs_filestream.h | 40 + kernel/fs/xfs/xfs_fsops.c | 850 +++++ kernel/fs/xfs/xfs_fsops.h | 30 + kernel/fs/xfs/xfs_globals.c | 49 + kernel/fs/xfs/xfs_icache.c | 1418 +++++++ kernel/fs/xfs/xfs_icache.h | 115 + kernel/fs/xfs/xfs_icreate_item.c | 188 + kernel/fs/xfs/xfs_icreate_item.h | 34 + kernel/fs/xfs/xfs_inode.c | 3606 +++++++++++++++++ kernel/fs/xfs/xfs_inode.h | 456 +++ kernel/fs/xfs/xfs_inode_item.c | 789 ++++ kernel/fs/xfs/xfs_inode_item.h | 54 + kernel/fs/xfs/xfs_ioctl.c | 1806 +++++++++ kernel/fs/xfs/xfs_ioctl.h | 95 + kernel/fs/xfs/xfs_ioctl32.c | 680 ++++ kernel/fs/xfs/xfs_ioctl32.h | 238 ++ kernel/fs/xfs/xfs_iomap.c | 920 +++++ kernel/fs/xfs/xfs_iomap.h | 32 + kernel/fs/xfs/xfs_iops.c | 1305 +++++++ kernel/fs/xfs/xfs_iops.h | 38 + kernel/fs/xfs/xfs_itable.c | 652 ++++ kernel/fs/xfs/xfs_itable.h | 99 + kernel/fs/xfs/xfs_linux.h | 384 ++ kernel/fs/xfs/xfs_log.c | 4007 +++++++++++++++++++ kernel/fs/xfs/xfs_log.h | 193 + kernel/fs/xfs/xfs_log_cil.c | 998 +++++ kernel/fs/xfs/xfs_log_priv.h | 561 +++ kernel/fs/xfs/xfs_log_recover.c | 4651 ++++++++++++++++++++++ kernel/fs/xfs/xfs_message.c | 113 + kernel/fs/xfs/xfs_message.h | 64 + kernel/fs/xfs/xfs_mount.c | 1283 +++++++ kernel/fs/xfs/xfs_mount.h | 335 ++ 
kernel/fs/xfs/xfs_mru_cache.c | 552 +++ kernel/fs/xfs/xfs_mru_cache.h | 46 + kernel/fs/xfs/xfs_pnfs.c | 329 ++ kernel/fs/xfs/xfs_pnfs.h | 19 + kernel/fs/xfs/xfs_qm.c | 1939 ++++++++++ kernel/fs/xfs/xfs_qm.h | 174 + kernel/fs/xfs/xfs_qm_bhv.c | 149 + kernel/fs/xfs/xfs_qm_syscalls.c | 770 ++++ kernel/fs/xfs/xfs_quota.h | 155 + kernel/fs/xfs/xfs_quotaops.c | 271 ++ kernel/fs/xfs/xfs_rtalloc.c | 1302 +++++++ kernel/fs/xfs/xfs_rtalloc.h | 145 + kernel/fs/xfs/xfs_stats.c | 198 + kernel/fs/xfs/xfs_stats.h | 249 ++ kernel/fs/xfs/xfs_super.c | 1889 +++++++++ kernel/fs/xfs/xfs_super.h | 79 + kernel/fs/xfs/xfs_symlink.c | 608 +++ kernel/fs/xfs/xfs_symlink.h | 27 + kernel/fs/xfs/xfs_sysctl.c | 243 ++ kernel/fs/xfs/xfs_sysctl.h | 108 + kernel/fs/xfs/xfs_sysfs.c | 239 ++ kernel/fs/xfs/xfs_sysfs.h | 60 + kernel/fs/xfs/xfs_trace.c | 54 + kernel/fs/xfs/xfs_trace.h | 2083 ++++++++++ kernel/fs/xfs/xfs_trans.c | 1105 ++++++ kernel/fs/xfs/xfs_trans.h | 245 ++ kernel/fs/xfs/xfs_trans_ail.c | 794 ++++ kernel/fs/xfs/xfs_trans_buf.c | 802 ++++ kernel/fs/xfs/xfs_trans_dquot.c | 887 +++++ kernel/fs/xfs/xfs_trans_extfree.c | 133 + kernel/fs/xfs/xfs_trans_inode.c | 135 + kernel/fs/xfs/xfs_trans_priv.h | 161 + kernel/fs/xfs/xfs_xattr.c | 243 ++ 158 files changed, 104917 insertions(+) create mode 100644 kernel/fs/xfs/Kconfig create mode 100644 kernel/fs/xfs/Makefile create mode 100644 kernel/fs/xfs/kmem.c create mode 100644 kernel/fs/xfs/kmem.h create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.c create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc.h create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_alloc_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_leaf.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.c create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_remote.h create mode 100644 kernel/fs/xfs/libxfs/xfs_attr_sf.h create 
mode 100644 kernel/fs/xfs/libxfs/xfs_bit.h create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.c create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap.h create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_bmap_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_cksum.h create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_da_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.c create mode 100644 kernel/fs/xfs/libxfs/xfs_da_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_block.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_data.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_leaf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_node.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_priv.h create mode 100644 kernel/fs/xfs/libxfs/xfs_dir2_sf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_dquot_buf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_fs.h create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.c create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc.h create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.c create mode 100644 kernel/fs/xfs/libxfs/xfs_ialloc_btree.h create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.c create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_buf.h create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.c create mode 100644 kernel/fs/xfs/libxfs/xfs_inode_fork.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_format.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_recover.h create mode 100644 kernel/fs/xfs/libxfs/xfs_log_rlimit.c create mode 100644 kernel/fs/xfs/libxfs/xfs_quota_defs.h create mode 100644 kernel/fs/xfs/libxfs/xfs_rtbitmap.c create mode 100644 
kernel/fs/xfs/libxfs/xfs_sb.c create mode 100644 kernel/fs/xfs/libxfs/xfs_sb.h create mode 100644 kernel/fs/xfs/libxfs/xfs_shared.h create mode 100644 kernel/fs/xfs/libxfs/xfs_symlink_remote.c create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.c create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_resv.h create mode 100644 kernel/fs/xfs/libxfs/xfs_trans_space.h create mode 100644 kernel/fs/xfs/libxfs/xfs_types.h create mode 100644 kernel/fs/xfs/mrlock.h create mode 100644 kernel/fs/xfs/uuid.c create mode 100644 kernel/fs/xfs/uuid.h create mode 100644 kernel/fs/xfs/xfs.h create mode 100644 kernel/fs/xfs/xfs_acl.c create mode 100644 kernel/fs/xfs/xfs_acl.h create mode 100644 kernel/fs/xfs/xfs_aops.c create mode 100644 kernel/fs/xfs/xfs_aops.h create mode 100644 kernel/fs/xfs/xfs_attr.h create mode 100644 kernel/fs/xfs/xfs_attr_inactive.c create mode 100644 kernel/fs/xfs/xfs_attr_list.c create mode 100644 kernel/fs/xfs/xfs_bit.c create mode 100644 kernel/fs/xfs/xfs_bmap_util.c create mode 100644 kernel/fs/xfs/xfs_bmap_util.h create mode 100644 kernel/fs/xfs/xfs_buf.c create mode 100644 kernel/fs/xfs/xfs_buf.h create mode 100644 kernel/fs/xfs/xfs_buf_item.c create mode 100644 kernel/fs/xfs/xfs_buf_item.h create mode 100644 kernel/fs/xfs/xfs_dir2_readdir.c create mode 100644 kernel/fs/xfs/xfs_discard.c create mode 100644 kernel/fs/xfs/xfs_discard.h create mode 100644 kernel/fs/xfs/xfs_dquot.c create mode 100644 kernel/fs/xfs/xfs_dquot.h create mode 100644 kernel/fs/xfs/xfs_dquot_item.c create mode 100644 kernel/fs/xfs/xfs_dquot_item.h create mode 100644 kernel/fs/xfs/xfs_error.c create mode 100644 kernel/fs/xfs/xfs_error.h create mode 100644 kernel/fs/xfs/xfs_export.c create mode 100644 kernel/fs/xfs/xfs_export.h create mode 100644 kernel/fs/xfs/xfs_extent_busy.c create mode 100644 kernel/fs/xfs/xfs_extent_busy.h create mode 100644 kernel/fs/xfs/xfs_extfree_item.c create mode 100644 kernel/fs/xfs/xfs_extfree_item.h create mode 100644 kernel/fs/xfs/xfs_file.c create 
mode 100644 kernel/fs/xfs/xfs_filestream.c create mode 100644 kernel/fs/xfs/xfs_filestream.h create mode 100644 kernel/fs/xfs/xfs_fsops.c create mode 100644 kernel/fs/xfs/xfs_fsops.h create mode 100644 kernel/fs/xfs/xfs_globals.c create mode 100644 kernel/fs/xfs/xfs_icache.c create mode 100644 kernel/fs/xfs/xfs_icache.h create mode 100644 kernel/fs/xfs/xfs_icreate_item.c create mode 100644 kernel/fs/xfs/xfs_icreate_item.h create mode 100644 kernel/fs/xfs/xfs_inode.c create mode 100644 kernel/fs/xfs/xfs_inode.h create mode 100644 kernel/fs/xfs/xfs_inode_item.c create mode 100644 kernel/fs/xfs/xfs_inode_item.h create mode 100644 kernel/fs/xfs/xfs_ioctl.c create mode 100644 kernel/fs/xfs/xfs_ioctl.h create mode 100644 kernel/fs/xfs/xfs_ioctl32.c create mode 100644 kernel/fs/xfs/xfs_ioctl32.h create mode 100644 kernel/fs/xfs/xfs_iomap.c create mode 100644 kernel/fs/xfs/xfs_iomap.h create mode 100644 kernel/fs/xfs/xfs_iops.c create mode 100644 kernel/fs/xfs/xfs_iops.h create mode 100644 kernel/fs/xfs/xfs_itable.c create mode 100644 kernel/fs/xfs/xfs_itable.h create mode 100644 kernel/fs/xfs/xfs_linux.h create mode 100644 kernel/fs/xfs/xfs_log.c create mode 100644 kernel/fs/xfs/xfs_log.h create mode 100644 kernel/fs/xfs/xfs_log_cil.c create mode 100644 kernel/fs/xfs/xfs_log_priv.h create mode 100644 kernel/fs/xfs/xfs_log_recover.c create mode 100644 kernel/fs/xfs/xfs_message.c create mode 100644 kernel/fs/xfs/xfs_message.h create mode 100644 kernel/fs/xfs/xfs_mount.c create mode 100644 kernel/fs/xfs/xfs_mount.h create mode 100644 kernel/fs/xfs/xfs_mru_cache.c create mode 100644 kernel/fs/xfs/xfs_mru_cache.h create mode 100644 kernel/fs/xfs/xfs_pnfs.c create mode 100644 kernel/fs/xfs/xfs_pnfs.h create mode 100644 kernel/fs/xfs/xfs_qm.c create mode 100644 kernel/fs/xfs/xfs_qm.h create mode 100644 kernel/fs/xfs/xfs_qm_bhv.c create mode 100644 kernel/fs/xfs/xfs_qm_syscalls.c create mode 100644 kernel/fs/xfs/xfs_quota.h create mode 100644 kernel/fs/xfs/xfs_quotaops.c create 
mode 100644 kernel/fs/xfs/xfs_rtalloc.c create mode 100644 kernel/fs/xfs/xfs_rtalloc.h create mode 100644 kernel/fs/xfs/xfs_stats.c create mode 100644 kernel/fs/xfs/xfs_stats.h create mode 100644 kernel/fs/xfs/xfs_super.c create mode 100644 kernel/fs/xfs/xfs_super.h create mode 100644 kernel/fs/xfs/xfs_symlink.c create mode 100644 kernel/fs/xfs/xfs_symlink.h create mode 100644 kernel/fs/xfs/xfs_sysctl.c create mode 100644 kernel/fs/xfs/xfs_sysctl.h create mode 100644 kernel/fs/xfs/xfs_sysfs.c create mode 100644 kernel/fs/xfs/xfs_sysfs.h create mode 100644 kernel/fs/xfs/xfs_trace.c create mode 100644 kernel/fs/xfs/xfs_trace.h create mode 100644 kernel/fs/xfs/xfs_trans.c create mode 100644 kernel/fs/xfs/xfs_trans.h create mode 100644 kernel/fs/xfs/xfs_trans_ail.c create mode 100644 kernel/fs/xfs/xfs_trans_buf.c create mode 100644 kernel/fs/xfs/xfs_trans_dquot.c create mode 100644 kernel/fs/xfs/xfs_trans_extfree.c create mode 100644 kernel/fs/xfs/xfs_trans_inode.c create mode 100644 kernel/fs/xfs/xfs_trans_priv.h create mode 100644 kernel/fs/xfs/xfs_xattr.c (limited to 'kernel/fs/xfs') diff --git a/kernel/fs/xfs/Kconfig b/kernel/fs/xfs/Kconfig new file mode 100644 index 000000000..5d47b4df6 --- /dev/null +++ b/kernel/fs/xfs/Kconfig @@ -0,0 +1,97 @@ +config XFS_FS + tristate "XFS filesystem support" + depends on BLOCK + depends on (64BIT || LBDAF) + select EXPORTFS + select LIBCRC32C + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at <http://oss.sgi.com/projects/xfs/> + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. 
Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_QUOTA + bool "XFS Quota support" + depends on XFS_FS + select QUOTACTL + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. + +config XFS_POSIX_ACL + bool "XFS POSIX ACL support" + depends on XFS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + +config XFS_RT + bool "XFS Realtime subvolume support" + depends on XFS_FS + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. It was + originally designed to provide deterministic data rates suitable + for media streaming applications, but is also useful as a generic + mechanism for ensuring data and metadata/log I/Os are completely + separated. Regular file I/Os are isolated to a separate device + from all other requests, and this can be done quite transparently + to applications via the inherit-realtime directory inode flag. 
+ + See the xfs man page in section 5 for additional information. + + If unsure, say N. + +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + +config XFS_DEBUG + bool "XFS Debugging support" + depends on XFS_FS + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/kernel/fs/xfs/Makefile b/kernel/fs/xfs/Makefile new file mode 100644 index 000000000..df6828570 --- /dev/null +++ b/kernel/fs/xfs/Makefile @@ -0,0 +1,124 @@ +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +ccflags-y += -I$(src) # needed for trace events +ccflags-y += -I$(src)/libxfs + +ccflags-$(CONFIG_XFS_DEBUG) += -g + +obj-$(CONFIG_XFS_FS) += xfs.o + +# this one should be compiled first, as the tracing macros can easily blow up +xfs-y += xfs_trace.o + +# build the libxfs code first +xfs-y += $(addprefix libxfs/, \ + xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_attr_remote.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_da_btree.o \ + xfs_da_format.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dquot_buf.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o \ + ) +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ + xfs_rtbitmap.o \ + ) + +# highlevel code +xfs-y += xfs_aops.o \ + xfs_attr_inactive.o \ + xfs_attr_list.o \ + xfs_bit.o \ + xfs_bmap_util.o \ + xfs_buf.o \ + xfs_dir2_readdir.o \ + xfs_discard.o \ + xfs_error.o \ + xfs_export.o \ + xfs_extent_busy.o \ + xfs_file.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_globals.o \ + xfs_icache.o \ + xfs_ioctl.o \ + xfs_iomap.o \ + xfs_iops.o \ + xfs_inode.o \ + xfs_itable.o \ + xfs_message.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_super.o \ + xfs_symlink.o \ + xfs_sysfs.o \ + xfs_trans.o \ + xfs_xattr.o \ + kmem.o \ + uuid.o + +# low-level transaction/log code +xfs-y += xfs_log.o \ + xfs_log_cil.o \ + xfs_buf_item.o \ + xfs_extfree_item.o \ + xfs_icreate_item.o \ + xfs_inode_item.o \ + xfs_log_recover.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + +# optional features +xfs-$(CONFIG_XFS_QUOTA) += 
xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o \ + xfs_quotaops.o + +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o + +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += xfs_stats.o +xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/kernel/fs/xfs/kmem.c b/kernel/fs/xfs/kmem.c new file mode 100644 index 000000000..a7a3a63bb --- /dev/null +++ b/kernel/fs/xfs/kmem.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include +#include "kmem.h" +#include "xfs_message.h" + +/* + * Greedy allocation. May fail and may return vmalloced memory. 
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+
+	while (!(ptr = vzalloc(kmsize)) && kmsize > minsize) {	/* give up once a minsize request has failed */
+		if ((kmsize >>= 1) <= minsize)	/* halve the request, but never go below minsize */
+			kmsize = minsize;
+	}
+	if (ptr)	/* *size is only written on success; NULL is returned on failure */
+		*size = kmsize;
+	return ptr;
+}
+
+void *
+kmem_alloc(size_t size, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmalloc(size, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
+
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+	unsigned noio_flag = 0;
+	void	*ptr;
+	gfp_t	lflags;
+
+	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+
+	/*
+	 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
+	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+	 * here. Hence we need to tell memory reclaim that we are in such a
+	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * the filesystem here and potentially deadlocking.
+	 */
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		noio_flag = memalloc_noio_save();
+
+	lflags = kmem_flags_convert(flags);
+	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+	     xfs_km_flags_t flags)
+{
+	void	*new;
+
+	new = kmem_alloc(newsize, flags);
+	if (ptr) {
+		if (new)
+			memcpy(new, ptr,
+				((oldsize < newsize) ? oldsize : newsize));
+		kmem_free(ptr);
+	}
+	return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+	int	retries = 0;
+	gfp_t	lflags = kmem_flags_convert(flags);
+	void	*ptr;
+
+	do {
+		ptr = kmem_cache_alloc(zone, lflags);
+		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+			return ptr;
+		if (!(++retries % 100))
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
+					__func__, lflags);
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	} while (1);
+}
diff --git a/kernel/fs/xfs/kmem.h b/kernel/fs/xfs/kmem.h
new file mode 100644
index 000000000..cc6b768fc
--- /dev/null
+++ b/kernel/fs/xfs/kmem.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include
+#include
+#include
+#include
+
+/*
+ * General memory allocation interfaces
+ */
+
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP	((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP	((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions.
We will also issue our own + * warnings, so we explicitly skip any generic ones (silly of us). + */ +static inline gfp_t +kmem_flags_convert(xfs_km_flags_t flags) +{ + gfp_t lflags; + + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); + + if (flags & KM_NOSLEEP) { + lflags = GFP_ATOMIC | __GFP_NOWARN; + } else { + lflags = GFP_KERNEL | __GFP_NOWARN; + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + + return lflags; +} + +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +static inline void kmem_free(const void *ptr) +{ + kvfree(ptr); +} + + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + +static inline void * +kmem_zalloc(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc(size, flags | KM_ZERO); +} + +/* + * Zone interfaces + */ + +#define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN +#define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT +#define KM_ZONE_SPREAD SLAB_MEM_SPREAD + +#define kmem_zone kmem_cache +#define kmem_zone_t struct kmem_cache + +static inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL); +} + +static inline kmem_zone_t * +kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, + void (*construct)(void *)) +{ + return kmem_cache_create(zone_name, size, 0, flags, construct); +} + +static inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone) + kmem_cache_destroy(zone); +} + +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); + +static inline void * +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + return kmem_zone_alloc(zone, flags | KM_ZERO); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff 
--git a/kernel/fs/xfs/libxfs/xfs_alloc.c b/kernel/fs/xfs/libxfs/xfs_alloc.c new file mode 100644 index 000000000..516162be1 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc.c @@ -0,0 +1,2639 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" + +struct workqueue_struct *xfs_alloc_wq; + +#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) + +#define XFSA_FIXUP_BNO_OK 1 +#define XFSA_FIXUP_CNT_OK 2 + +STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); +STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. 
+ */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. 
+ *
+ * *bno and *len are only written when xfs_btree_get_rec() returns no
+ * error and sets *stat to 1; in that case the record's startblock and
+ * blockcount are converted with be32_to_cpu().  Otherwise the outputs
+ * are left untouched.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	union xfs_btree_rec	*rec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		*bno = be32_to_cpu(rec->alloc.ar_startblock);
+		*len = be32_to_cpu(rec->alloc.ar_blockcount);
+	}
+	return error;
+}
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ *
+ * The found extent is first passed through xfs_extent_busy_trim().  If
+ * args->alignment > 1 and the trimmed length is at least args->minlen,
+ * the start is rounded up to the alignment and the length is reduced by
+ * the rounding distance (0 if nothing is left).  Otherwise the trimmed
+ * extent is returned as is.
+ */
+STATIC void
+xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
+	xfs_agblock_t	foundbno,	/* starting block in found extent */
+	xfs_extlen_t	foundlen,	/* length in found extent */
+	xfs_agblock_t	*resbno,	/* result block number */
+	xfs_extlen_t	*reslen)	/* result length */
+{
+	xfs_agblock_t	bno;
+	xfs_extlen_t	len;
+
+	/* Trim busy sections out of found extent */
+	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		/* rounding may consume the whole extent; never go negative */
+		*reslen = diff >= len ? 0 : len - diff;
+	} else {
+		*resbno = bno;
+		*reslen = len;
+	}
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ *
+ * The chosen start block is stored in *newbnop, or NULLAGBLOCK when no
+ * suitable start exists in the free extent.  The return value is the
+ * absolute distance between the chosen start and wantbno, or 0 when
+ * *newbnop is NULLAGBLOCK -- callers must check *newbnop to tell a
+ * perfect match from a failure.
+ */
+STATIC xfs_extlen_t			/* difference value (absolute) */
+xfs_alloc_compute_diff(
+	xfs_agblock_t	wantbno,	/* target starting block */
+	xfs_extlen_t	wantlen,	/* target length */
+	xfs_extlen_t	alignment,	/* target alignment */
+	char		userdata,	/* are we allocating data? */
+	xfs_agblock_t	freebno,	/* freespace's starting block */
+	xfs_extlen_t	freelen,	/* freespace's length */
+	xfs_agblock_t	*newbnop)	/* result: best start block from free */
+{
+	xfs_agblock_t	freeend;	/* end of freespace extent */
+	xfs_agblock_t	newbno1;	/* return block number */
+	xfs_agblock_t	newbno2;	/* other new block number */
+	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
+	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
+	xfs_agblock_t	wantend;	/* end of target extent */
+
+	ASSERT(freelen >= wantlen);
+	freeend = freebno + freelen;
+	wantend = wantbno + wantlen;
+	/*
+	 * We want to allocate from the start of a free extent if it is past
+	 * the desired block or if we are allocating user data and the free
+	 * extent is before desired block. The second case is there to allow
+	 * for contiguous allocation from the remaining free space if the file
+	 * grows in the short term.
+	 */
+	if (freebno >= wantbno || (userdata && freeend < wantend)) {
+		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else if (freeend >= wantend && alignment > 1) {
+		/*
+		 * Free extent covers the target: consider the aligned block
+		 * at/after wantbno (newbno1) and the one before it (newbno2),
+		 * preferring the longer usable length, then the closer start.
+		 */
+		newbno1 = roundup(wantbno, alignment);
+		newbno2 = newbno1 - alignment;
+		if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+		else
+			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+		if (newbno2 < freebno)
+			newbno2 = NULLAGBLOCK;
+		else
+			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+			if (newlen1 < newlen2 ||
+			    (newlen1 == newlen2 &&
+			     XFS_ABSDIFF(newbno1, wantbno) >
+			     XFS_ABSDIFF(newbno2, wantbno)))
+				newbno1 = newbno2;
+		} else if (newbno2 != NULLAGBLOCK)
+			newbno1 = newbno2;
+	} else if (freeend >= wantend) {
+		/* unaligned and the target fits: use it exactly */
+		newbno1 = wantbno;
+	} else if (alignment > 1) {
+		/*
+		 * Free extent ends before the target end: aim for the last
+		 * wantlen blocks, stepping back one alignment unit if the
+		 * round-up overshot and the earlier block is still inside.
+		 */
+		newbno1 = roundup(freeend - wantlen, alignment);
+		if (newbno1 > freeend - wantlen &&
+		    newbno1 - alignment >= freebno)
+			newbno1 -= alignment;
+		else if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else
+		newbno1 = freeend - wantlen;
+	*newbnop = newbno1;
+	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ *
+ * The length is only ever reduced, and args->len is only rewritten when
+ * the reduced value is still >= args->minlen.
+ */
+STATIC void
+xfs_alloc_fix_len(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen;
+
+	ASSERT(args->mod < args->prod);
+	rlen = args->len;
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	    (args->mod == 0 && rlen < args->prod))
+		return;
+	k = rlen % args->prod;
+	if (k == args->mod)
+		return;
+	/* drop down to the nearest smaller length congruent to mod */
+	if (k > args->mod)
+		rlen = rlen - (k - args->mod);
+	else
+		rlen = rlen - args->prod + (args->mod - k);
+	/* casts to (int) catch length underflows */
+	if ((int)rlen < (int)args->minlen)
+		return;
+	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+	ASSERT(rlen % args->prod == args->mod);
+	args->len = rlen;
+}
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ *
+ * When 0 is returned, args->agbno has been set to NULLAGBLOCK and
+ * args->len has already been reduced by the shortfall.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+	xfs_alloc_arg_t	*args)		/* allocation argument structure */
+{
+	xfs_agf_t	*agf;		/* a.g. freelist header */
+	int		diff;		/* free space difference */
+
+	if (args->minleft == 0)
+		return 1;
+	agf = XFS_BUF_TO_AGF(args->agbp);
+	/* negative diff == number of blocks by which we would overshoot */
+	diff = be32_to_cpu(agf->agf_freeblks)
+		- args->len - args->minleft;
+	if (diff >= 0)
+		return 1;
+	args->len += diff;		/* shrink the allocated space */
+	/* casts to (int) catch length underflows */
+	if ((int)args->len >= (int)args->minlen)
+		return 1;
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ *
+ * XFSA_FIXUP_CNT_OK / XFSA_FIXUP_BNO_OK: when set, the matching cursor
+ * is not looked up again (it is only cross-checked against [fbno, flen]
+ * under DEBUG); when clear, an exact lookup of [fbno, flen] is done and
+ * must succeed.
+ *
+ * The by-size entry is deleted and any remainders re-inserted, while
+ * the by-block entry is updated in place when a remainder exists.
+ * Returns 0 or an error; the XFS_WANT_CORRUPTED_RETURN checks return
+ * from the function when an expected btree state is not found.
+ */
+STATIC int				/* error code */
+xfs_alloc_fixup_trees(
+	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
+	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
+	xfs_agblock_t	fbno,		/* starting block of free extent */
+	xfs_extlen_t	flen,		/* length of free extent */
+	xfs_agblock_t	rbno,		/* starting block of returned extent */
+	xfs_extlen_t	rlen,		/* length of returned extent */
+	int		flags)		/* flags, XFSA_FIXUP_... */
+{
+	int		error;		/* error code */
+	int		i;		/* operation results */
+	xfs_agblock_t	nfbno1;		/* first new free startblock */
+	xfs_agblock_t	nfbno2;		/* second new free startblock */
+	xfs_extlen_t	nflen1=0;	/* first new free length */
+	xfs_extlen_t	nflen2=0;	/* second new free length */
+	struct xfs_mount *mp;
+
+	mp = cnt_cur->bc_mp;
+
+	/*
+	 * Look up the record in the by-size tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	/*
+	 * Look up the record in the by-block tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+
+#ifdef DEBUG
+	/* single-level trees: both root blocks must hold the same record count */
+	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+		struct xfs_btree_block	*bnoblock;
+		struct xfs_btree_block	*cntblock;
+
+		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			bnoblock->bb_numrecs == cntblock->bb_numrecs);
+	}
+#endif
+
+	/*
+	 * Deal with all four cases: the allocated record is contained
+	 * within the freespace record, so we can have new freespace
+	 * at either (or both) end, or no freespace remaining.
+	 */
+	if (rbno == fbno && rlen == flen)
+		nfbno1 = nfbno2 = NULLAGBLOCK;
+	else if (rbno == fbno) {
+		nfbno1 = rbno + rlen;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else if (rbno + rlen == fbno + flen) {
+		nfbno1 = fbno;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else {
+		nfbno1 = fbno;
+		nflen1 = rbno - fbno;
+		nfbno2 = rbno + rlen;
+		nflen2 = (fbno + flen) - nfbno2;
+	}
+	/*
+	 * Delete the entry from the by-size btree.
+	 */
+	if ((error = xfs_btree_delete(cnt_cur, &i)))
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	/*
+	 * Add new by-size btree entry(s).
+	 */
+	if (nfbno1 != NULLAGBLOCK) {
+		/* the lookup must miss (i == 0) before the insert */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	/*
+	 * Fix up the by-block btree entry(s).
+	 */
+	if (nfbno1 == NULLAGBLOCK) {
+		/*
+		 * No remaining freespace, just delete the by-block tree entry.
+		 */
+		if ((error = xfs_btree_delete(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	} else {
+		/*
+		 * Update the by-block entry to start later|be shorter.
+		 */
+		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+			return error;
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		/*
+		 * 2 resulting free entries, need to add one.
+		 */
+		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
+		if ((error = xfs_btree_insert(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
+	}
+	return 0;
+}
+
+/*
+ * Structural check of an AGFL buffer.  Returns false when the uuid does
+ * not match the superblock, the magic number is wrong, the sequence
+ * number disagrees with the attached perag (if any), or any entry other
+ * than NULLAGBLOCK is >= sb_agblocks.  Returns true otherwise.
+ */
+static bool
+xfs_agfl_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		i;
+
+	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+		return false;
+	if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+		return false;
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
+		return false;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			return false;
+	}
+	return true;
+}
+
+/*
+ * Read verifier for AGFL buffers.  On CRC-enabled filesystems a bad
+ * checksum sets -EFSBADCRC on the buffer and a failed structural check
+ * sets -EFSCORRUPTED; either is then reported via xfs_verifier_error().
+ */
+static void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	/*
+	 * There is no verification of non-crc AGFLs because mkfs does not
+	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
+	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
+	 * can't verify just those entries are valid.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+/*
+ * Write verifier for AGFL buffers.  On CRC-enabled filesystems the
+ * buffer is structurally checked; on success the LSN from the attached
+ * buffer log item (if any) is stamped into the AGFL and the checksum is
+ * recomputed.
+ */
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+
+	/* no verification of non-crc AGFLs */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (!xfs_agfl_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (bip)
+		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
+}
+
+/* Verifier hooks passed to the buffer layer when reading the AGFL. */
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
+/*
+ * Read in the allocation group free block array.
+ *
+ * On success *bpp is set to the buffer, whose reference hint has been
+ * set to XFS_AGFL_REF.  On failure the error from xfs_trans_read_buf()
+ * is returned and *bpp is not written.
+ */
+STATIC int				/* error */
+xfs_alloc_read_agfl(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
+{
+	xfs_buf_t	*bp;		/* return value */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	error = xfs_trans_read_buf(
+			mp, tp, mp->m_ddev_targp,
+			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+	if (error)
+		return error;
+	xfs_buf_set_ref(bp, XFS_AGFL_REF);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Apply a signed delta of len blocks to the free block counters: the
+ * in-core perag count, the AGF buffer's agf_freeblks and the
+ * transaction's agblocks delta.  Returns -EFSCORRUPTED if the resulting
+ * agf_freeblks exceeds agf_length; note the counters have already been
+ * modified at that point and the AGF change is not logged.  Otherwise
+ * logs the AGF free block field and returns 0.
+ */
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
+
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return -EFSCORRUPTED;
+
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent(
+	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
+{
+	int		error=0;
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->mod < args->prod);
+	ASSERT(args->alignment > 0);
+	/*
+	 * Branch to correct routine based on the type.
+	 */
+	args->wasfromfl = 0;
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+		error = xfs_alloc_ag_vextent_size(args);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_alloc_ag_vextent_near(args);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_alloc_ag_vextent_exact(args);
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+
+	/* nothing allocated (or failure): no accounting to do */
+	if (error || args->agbno == NULLAGBLOCK)
+		return error;
+
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	/* the AG free block counters are only adjusted when !wasfromfl */
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
+	}
+
+	/* the superblock free block delta is only applied when !isfl */
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ *
+ * "Can't do it" is not an error: the function returns 0 with
+ * args->agbno set to NULLAGBLOCK.  A non-zero return is a real error
+ * and the cursors have been torn down with XFS_BTREE_ERROR.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_exact(
+	xfs_alloc_arg_t	*args)	/* allocation argument structure */
+{
+	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
+	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
+	int		error;
+	xfs_agblock_t	fbno;	/* start block of found extent */
+	xfs_extlen_t	flen;	/* length of found extent */
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	int		i;	/* success/failure of operation */
+
+	ASSERT(args->alignment == 1);
+
+	/*
+	 * Allocate/initialize a cursor for the by-number freespace btree.
+	 */
+	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+					  args->agno, XFS_BTNUM_BNO);
+
+	/*
+	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+	 * Look for the closest free block <= bno, it must contain bno
+	 * if any free block does.
+	 */
+	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
+	if (error)
+		goto error0;
+	if (!i)
+		goto not_found;
+
+	/*
+	 * Grab the freespace record.
+	 */
+	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+	ASSERT(fbno <= args->agbno);
+
+	/*
+	 * Check for overlapping busy extents.
+	 */
+	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+	/*
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
+	 */
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
+		goto not_found;
+
+	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 *
+	 * Fix the length according to mod and prod if given.
+	 */
+	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
+						- args->agbno;
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args))
+		goto not_found;
+
+	ASSERT(args->agbno + args->len <= tend);
+
+	/*
+	 * We are allocating agbno for args->len
+	 * Allocate/initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT);
+	ASSERT(args->agbno + args->len <=
+		be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
+	/* bno_cur already points at [fbno, flen]; cnt_cur must be looked up */
+	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
+				      args->len, XFSA_FIXUP_BNO_OK);
+	if (error) {
+		/* cnt_cur is freed here; error0 only knows about bno_cur */
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+		goto error0;
+	}
+
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+
+	args->wasfromfl = 0;
+	trace_xfs_alloc_exact_done(args);
+	return 0;
+
+not_found:
+	/* Didn't find it, return null. */
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	args->agbno = NULLAGBLOCK;
+	trace_xfs_alloc_exact_notfound(args);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	trace_xfs_alloc_exact_error(args);
+	return error;
+}
+
+/*
+ * Search the btree in a given direction via the search cursor and compare
+ * the records found against the good extent we've already found.
+ *
+ * On a 0 return exactly one of the two cursors has been deleted and its
+ * pointer set to NULL: *scur when the good extent wins (or the search
+ * runs off the end of the tree), *gcur when the searched extent wins.
+ * On error neither cursor is touched; the caller must clean both up.
+ * Note that args->len is overwritten while candidates are evaluated.
+ */
+STATIC int
+xfs_alloc_find_best_extent(
+	struct xfs_alloc_arg	*args,	/* allocation argument structure */
+	struct xfs_btree_cur	**gcur,	/* good cursor */
+	struct xfs_btree_cur	**scur,	/* searching cursor */
+	xfs_agblock_t		gdiff,	/* difference for search comparison */
+	xfs_agblock_t		*sbno,	/* extent found by search */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
+	int			dir)	/* 0 = search right, 1 = search left */
+{
+	xfs_agblock_t		new;
+	xfs_agblock_t		sdiff;
+	int			error;
+	int			i;
+
+	/* The good extent is perfect, no need to search. */
+	if (!gdiff)
+		goto out_use_good;
+
+	/*
+	 * Look until we find a better one, run out of space or run off the end.
+	 */
+	do {
+		error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+
+		/*
+		 * The good extent is closer than this one.
+		 */
+		if (!dir) {
+			if (*sbnoa >= args->agbno + gdiff)
+				goto out_use_good;
+		} else {
+			if (*sbnoa <= args->agbno - gdiff)
+				goto out_use_good;
+		}
+
+		/*
+		 * Same distance, compare length and pick the best.
+		 */
+		if (*slena >= args->minlen) {
+			args->len = XFS_EXTLEN_MIN(*slena, args->maxlen);
+			xfs_alloc_fix_len(args);
+
+			sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+						       args->alignment,
+						       args->userdata, *sbnoa,
+						       *slena, &new);
+
+			/*
+			 * Choose closer size and invalidate other cursor.
+			 */
+			if (sdiff < gdiff)
+				goto out_use_search;
+			goto out_use_good;
+		}
+
+		/* record too short once aligned: step to the next one */
+		if (!dir)
+			error = xfs_btree_increment(*scur, 0, &i);
+		else
+			error = xfs_btree_decrement(*scur, 0, &i);
+		if (error)
+			goto error0;
+	} while (i);
+
+out_use_good:
+	xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR);
+	*scur = NULL;
+	return 0;
+
+out_use_search:
+	xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR);
+	*gcur = NULL;
+	return 0;
+
+error0:
+	/* caller invalidates cursors */
+	return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */ +STATIC int /* error */ +xfs_alloc_ag_vextent_near( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ + xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ + xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ + xfs_agblock_t gtbno; /* start bno of right side entry */ + xfs_agblock_t gtbnoa; /* aligned ... */ + xfs_extlen_t gtdiff; /* difference to right side entry */ + xfs_extlen_t gtlen; /* length of right side entry */ + xfs_extlen_t gtlena; /* aligned ... */ + xfs_agblock_t gtnew; /* useful start bno of right side */ + int error; /* error code */ + int i; /* result code, temporary */ + int j; /* result code, temporary */ + xfs_agblock_t ltbno; /* start bno of left side entry */ + xfs_agblock_t ltbnoa; /* aligned ... */ + xfs_extlen_t ltdiff; /* difference to left side entry */ + xfs_extlen_t ltlen; /* length of left side entry */ + xfs_extlen_t ltlena; /* aligned ... */ + xfs_agblock_t ltnew; /* useful start bno of left side */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +#ifdef DEBUG + /* + * Randomly don't execute the first algorithm. + */ + int dofirst; /* set to do first algorithm */ + + dofirst = prandom_u32() & 1; +#endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + + /* + * Get a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + + /* + * See if there are any free extents as big as maxlen. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i))) + goto error0; + /* + * If none, then pick up the last entry in the tree unless the + * tree is empty. 
+ */ + if (!i) { + if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, <bno, + <len, &i))) + goto error0; + if (i == 0 || ltlen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); + return 0; + } + ASSERT(i == 1); + } + args->wasfromfl = 0; + + /* + * First algorithm. + * If the requested extent is large wrt the freespaces available + * in this a.g., then the cursor will be pointing to a btree entry + * near the right edge of the tree. If it's in the last btree leaf + * block, then we just examine all the entries in that block + * that are big enough, and pick the best one. + * This is written as a while loop so we can break out of it, + * but we never loop back to the top. + */ + while (xfs_btree_islastblock(cnt_cur, 0)) { + xfs_extlen_t bdiff; + int besti=0; + xfs_extlen_t blen=0; + xfs_agblock_t bnew=0; + +#ifdef DEBUG + if (dofirst) + break; +#endif + /* + * Start from the entry that lookup found, sequence through + * all larger free blocks. If we're actually pointing at a + * record smaller than maxlen, go to the start of this block, + * and skip all those smaller than minlen. + */ + if (ltlen || args->alignment > 1) { + cnt_cur->bc_ptrs[0] = 1; + do { + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, + <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (ltlen >= args->minlen) + break; + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) + goto error0; + } while (i); + ASSERT(ltlen >= args->minlen); + if (!i) + break; + } + i = cnt_cur->bc_ptrs[0]; + for (j = 1, blen = 0, bdiff = 0; + !error && j && (blen < args->maxlen || bdiff > 0); + error = xfs_btree_increment(cnt_cur, 0, &j)) { + /* + * For each entry, decide if it's better than + * the previous best entry. 
+ */ + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena < args->minlen) + continue; + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ASSERT(args->len >= args->minlen); + if (args->len < blen) + continue; + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, ltbnoa, + ltlena, <new); + if (ltnew != NULLAGBLOCK && + (args->len > blen || ltdiff < bdiff)) { + bdiff = ltdiff; + bnew = ltnew; + blen = args->len; + besti = cnt_cur->bc_ptrs[0]; + } + } + /* + * It didn't work. We COULD be in a case where + * there's a good record somewhere, so try again. + */ + if (blen == 0) + break; + /* + * Point at the best entry, and retrieve it again. + */ + cnt_cur->bc_ptrs[0] = besti; + if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->len = blen; + if (!xfs_alloc_fix_minleft(args)) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_nominleft(args); + return 0; + } + blen = args->len; + /* + * We are allocating starting at bnew for blen blocks. + */ + args->agbno = bnew; + ASSERT(bnew >= ltbno); + ASSERT(bnew + blen <= ltbno + ltlen); + /* + * Set up a cursor for the by-bno tree. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); + /* + * Fix up the btree entries. + */ + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, + ltlen, bnew, blen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + + trace_xfs_alloc_near_first(args); + return 0; + } + /* + * Second algorithm. 
+ * Search in the by-bno tree to the left and to the right + * simultaneously, until in each case we find a space big enough, + * or run into the edge of the tree. When we run into the edge, + * we deallocate that cursor. + * If both searches succeed, we compare the two spaces and pick + * the better one. + * With alignment, it's possible for both to fail; the upper + * level algorithm that picks allocation groups for allocations + * is not supposed to do this. + */ + /* + * Allocate and initialize the cursor for the leftward search. + */ + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + /* + * Lookup <= bno to find the leftward search's starting point. + */ + if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i))) + goto error0; + if (!i) { + /* + * Didn't find anything; use this cursor for the rightward + * search. + */ + bno_cur_gt = bno_cur_lt; + bno_cur_lt = NULL; + } + /* + * Found something. Duplicate the cursor for the rightward search. + */ + else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt))) + goto error0; + /* + * Increment the cursor, so we will point at the entry just right + * of the leftward entry if any, or to the leftmost entry. + */ + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + /* + * It failed, there are no rightward entries. + */ + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + /* + * Loop going left with the leftward cursor, right with the + * rightward cursor, until either both directions give up or + * we find an entry at least as big as minlen. 
+ */ + do { + if (bno_cur_lt) { + if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena >= args->minlen) + break; + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_lt, + XFS_BTREE_NOERROR); + bno_cur_lt = NULL; + } + } + if (bno_cur_gt) { + if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena); + if (gtlena >= args->minlen) + break; + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) + goto error0; + if (!i) { + xfs_btree_del_cursor(bno_cur_gt, + XFS_BTREE_NOERROR); + bno_cur_gt = NULL; + } + } + } while (bno_cur_lt || bno_cur_gt); + + /* + * Got both cursors still active, need to find better entry. + */ + if (bno_cur_lt && bno_cur_gt) { + if (ltlena >= args->minlen) { + /* + * Left side is good, look for a right side entry. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, ltbnoa, + ltlena, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, + >bnoa, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + + /* + * Right side is good, look for a left side entry. + */ + args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); + xfs_alloc_fix_len(args); + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, gtbnoa, + gtlena, >new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, + <bnoa, <lena, + 1 /* search left */); + } + + if (error) + goto error0; + } + + /* + * If we couldn't get anything, give up. 
+ */ + if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + trace_xfs_alloc_size_neither(args); + args->agbno = NULLAGBLOCK; + return 0; + } + + /* + * At this point we have selected a freespace entry, either to the + * left or to the right. If it's on the right, copy all the + * useful variables to the "left" set so we only have one + * copy of this code. + */ + if (bno_cur_gt) { + bno_cur_lt = bno_cur_gt; + bno_cur_gt = NULL; + ltbno = gtbno; + ltbnoa = gtbnoa; + ltlen = gtlen; + ltlena = gtlena; + j = 1; + } else + j = 0; + + /* + * Fix up the length and compute the useful address. + */ + args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); + xfs_alloc_fix_len(args); + if (!xfs_alloc_fix_minleft(args)) { + trace_xfs_alloc_near_nominleft(args); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + return 0; + } + rlen = args->len; + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + args->userdata, ltbnoa, ltlena, <new); + ASSERT(ltnew >= ltbno); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); + ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + args->agbno = ltnew; + + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, + ltnew, rlen, XFSA_FIXUP_BNO_OK))) + goto error0; + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); + return 0; + + error0: + trace_xfs_alloc_near_error(args); + if (cnt_cur != NULL) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur_lt != NULL) + xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR); + if (bno_cur_gt != NULL) + xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR); + return error; +} + +/* + * Allocate a 
variable extent anywhere in the allocation group agno. + * Extent's length (returned in len) will be between minlen and maxlen, + * and of the form k * prod + mod unless there's nothing that large. + * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_size( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + int error; /* error result */ + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + int i; /* temp status variable */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; + +restart: + /* + * Allocate and initialize a cursor for the by-size btree. + */ + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + bno_cur = NULL; + + /* + * Look for an entry >= maxlen+alignment-1 blocks. + */ + if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, + args->maxlen + args->alignment - 1, &i))) + goto error0; + + /* + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. + */ + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) + goto error0; + if (i == 0 || flen == 0) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_noentry(args); + return 0; + } + ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. 
+ * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. + */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } + } + + /* + * In the first case above, we got the last entry in the + * by-size btree. Now we check to see if the space hits maxlen + * once aligned; if not, we search left for something better. + * This can't happen in the second case above. 
+ */ + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), error0); + if (rlen < args->maxlen) { + xfs_agblock_t bestfbno; + xfs_extlen_t bestflen; + xfs_agblock_t bestrbno; + xfs_extlen_t bestrlen; + + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + for (;;) { + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) + goto error0; + if (i == 0) + break; + if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + if (flen < bestrlen) + break; + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || + (rlen <= flen && rbno + rlen <= fbno + flen), + error0); + if (rlen > bestrlen) { + bestrlen = rlen; + bestrbno = rbno; + bestflen = flen; + bestfbno = fbno; + if (rlen == args->maxlen) + break; + } + } + if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + rlen = bestrlen; + rbno = bestrbno; + flen = bestflen; + fbno = bestfbno; + } + args->wasfromfl = 0; + /* + * Fix up the length. + */ + args->len = rlen; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; + } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; + rlen = args->len; + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); + /* + * Allocate and initialize a cursor for the by-block tree. 
+ */ + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, + rbno, rlen, XFSA_FIXUP_CNT_OK))) + goto error0; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + cnt_cur = bno_cur = NULL; + args->len = rlen; + args->agbno = rbno; + XFS_WANT_CORRUPTED_GOTO(args->mp, + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + trace_xfs_alloc_size_done(args); + return 0; + +error0: + trace_xfs_alloc_size_error(args); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; +} + +/* + * Deal with the case where only small freespaces remain. + * Either return the contents of the last freespace record, + * or allocate space from the freelist if there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + xfs_alloc_arg_t *args, /* allocation argument structure */ + xfs_btree_cur_t *ccur, /* by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int i; + + if ((error = xfs_btree_decrement(ccur, 0, &i))) + goto error0; + if (i) { + if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); + } + /* + * Nothing in the btree, try the freelist. Make sure + * to respect minleft even when pulling from the + * freelist. 
+ */ + else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) + > args->minleft)) { + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) + goto error0; + if (fbno != NULLAGBLOCK) { + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + + if (args->userdata) { + xfs_buf_t *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, + args->agno, fbno, 0); + xfs_trans_binval(args->tp, bp); + } + args->len = 1; + args->agbno = fbno; + XFS_WANT_CORRUPTED_GOTO(args->mp, + args->agbno + args->len <= + be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error0); + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); + *stat = 0; + return 0; + } + /* + * Nothing in the freelist. + */ + else + flen = 0; + } + /* + * Can't allocate from the freelist for some reason. + */ + else { + fbno = NULLAGBLOCK; + flen = 0; + } + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; + +error0: + trace_xfs_alloc_small_error(args); + return error; +} + +/* + * Free the extent starting at agno/bno for length. + */ +STATIC int /* error */ +xfs_free_ag_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. 
freelist header */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t bno, /* starting block number */ + xfs_extlen_t len, /* length of extent */ + int isfl) /* set if is freelist blocks - no sb acctg */ +{ + xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ + xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ + int error; /* error return value */ + xfs_agblock_t gtbno; /* start of right neighbor block */ + xfs_extlen_t gtlen; /* length of right neighbor block */ + int haveleft; /* have a left neighbor block */ + int haveright; /* have a right neighbor block */ + int i; /* temp, result code */ + xfs_agblock_t ltbno; /* start of left neighbor block */ + xfs_extlen_t ltlen; /* length of left neighbor block */ + xfs_mount_t *mp; /* mount point struct for filesystem */ + xfs_agblock_t nbno; /* new starting block of freespace */ + xfs_extlen_t nlen; /* new length of freespace */ + xfs_perag_t *pag; /* per allocation group data */ + + mp = tp->t_mountp; + /* + * Allocate and initialize a cursor for the by-block btree. + */ + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + cnt_cur = NULL; + /* + * Look for a neighboring block on the left (lower block numbers) + * that is contiguous with this space. + */ + if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) + goto error0; + if (haveleft) { + /* + * There is a block to our left. + */ + if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * It's not contiguous, though. + */ + if (ltbno + ltlen < bno) + haveleft = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); + } + } + /* + * Look for a neighboring block on the right (higher block numbers) + * that is contiguous with this space. 
+ */ + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) + goto error0; + if (haveright) { + /* + * There is a block to our right. + */ + if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * It's not contiguous, though. + */ + if (bno + len < gtbno) + haveright = 0; + else { + /* + * If this failure happens the request to free this + * space was invalid, it's (partly) already free. + * Very bad. + */ + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); + } + } + /* + * Now allocate and initialize a cursor for the by-size tree. + */ + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + /* + * Have both left and right contiguous neighbors. + * Merge all three into a single free block. + */ + if (haveleft && haveright) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Delete the old by-block entry for the right block. + */ + if ((error = xfs_btree_delete(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Move the by-block cursor back to the left neighbor. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); +#ifdef DEBUG + /* + * Check that this is the right record: delete didn't + * mangle the cursor. 
+ */ + { + xfs_agblock_t xxbno; + xfs_extlen_t xxlen; + + if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, + &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1 && xxbno == ltbno && xxlen == ltlen, + error0); + } +#endif + /* + * Update remaining by-block entry to the new, joined block. + */ + nbno = ltbno; + nlen = len + ltlen + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a left contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveleft) { + /* + * Delete the old by-size entry on the left. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Back up the by-block cursor to the left neighbor, and + * update its length. + */ + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + nbno = ltbno; + nlen = len + ltlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * Have only a right contiguous neighbor. + * Merge it together with the new freespace. + */ + else if (haveright) { + /* + * Delete the old by-size entry on the right. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + if ((error = xfs_btree_delete(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + /* + * Update the starting block and length of the right + * neighbor in the by-block tree. + */ + nbno = bno; + nlen = len + gtlen; + if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) + goto error0; + } + /* + * No contiguous neighbors. + * Insert the new freespace into the by-block tree. 
+ */ + else { + nbno = bno; + nlen = len; + if ((error = xfs_btree_insert(bno_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + bno_cur = NULL; + /* + * In all cases we need to insert the new freespace in the by-size tree. + */ + if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); + if ((error = xfs_btree_insert(cnt_cur, &i))) + goto error0; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + cnt_cur = NULL; + + /* + * Update the freespace totals in the ag and superblock. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_perag_put(pag); + if (error) + goto error0; + + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + + return 0; + + error0: + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + if (bno_cur) + xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); + if (cnt_cur) + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Visible (exported) allocation/free functions. + * Some of these are used just by xfs_alloc_btree.c and this file. + */ + +/* + * Compute and fill in value of m_ag_maxlevels. 
+ */ +void +xfs_alloc_compute_maxlevels( + xfs_mount_t *mp) /* file system mount structure */ +{ + int level; + uint maxblocks; + uint maxleafents; + int minleafrecs; + int minnoderecs; + + maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; + minleafrecs = mp->m_alloc_mnr[0]; + minnoderecs = mp->m_alloc_mnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + mp->m_ag_maxlevels = level; +} + +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* + * Decide whether to use this allocation group for this allocation. + * If so, fix up the btree freelist's size. + */ +STATIC int /* error */ +xfs_alloc_fix_freelist( + xfs_alloc_arg_t *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ +{ + xfs_buf_t *agbp; /* agf buffer pointer */ + xfs_agf_t *agf; /* a.g. 
freespace structure pointer */ + xfs_buf_t *agflbp;/* agfl buffer pointer */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t delta; /* new blocks needed in freelist */ + int error; /* error result code */ + xfs_extlen_t longest;/* longest extent in allocation group */ + xfs_mount_t *mp; /* file system mount point structure */ + xfs_extlen_t need; /* total blocks needed in freelist */ + xfs_perag_t *pag; /* per-ag information structure */ + xfs_alloc_arg_t targs; /* local allocation arguments */ + xfs_trans_t *tp; /* transaction pointer */ + + mp = args->mp; + + pag = args->pag; + tp = args->tp; + if (!pag->pagf_init) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } else + agbp = NULL; + + /* + * If this is a metadata preferred pag and we are user data + * then try somewhere else if we are not being asked to + * try harder at this point + */ + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + + /* + * Get the a.g. freespace buffer. + * Can fail if we're not blocking on locks, and it's held. 
+ */ + if (agbp == NULL) { + if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, + &agbp))) + return error; + if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + args->agbp = NULL; + return 0; + } + } + /* + * Figure out how many blocks we should have in the freelist. + */ + agf = XFS_BUF_TO_AGF(agbp); + need = XFS_MIN_FREELIST(agf, mp); + /* + * If there isn't enough total or single-extent, reject it. + */ + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } + } + /* + * Make the freelist shorter if it's too long. + */ + while (be32_to_cpu(agf->agf_flcount) > need) { + xfs_buf_t *bp; + + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) + return error; + if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) + return error; + bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); + xfs_trans_binval(tp, bp); + } + /* + * Initialize the args structure. + */ + memset(&targs, 0, sizeof(targs)); + targs.tp = tp; + targs.mp = mp; + targs.agbp = agbp; + targs.agno = args->agno; + targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.type = XFS_ALLOCTYPE_THIS_AG; + targs.pag = pag; + if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) + return error; + /* + * Make the freelist longer if it's too short. 
+ */ + while (be32_to_cpu(agf->agf_flcount) < need) { + targs.agbno = 0; + targs.maxlen = need - be32_to_cpu(agf->agf_flcount); + /* + * Allocate as many blocks as possible at once. + */ + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); + return error; + } + /* + * Stop if we run out. Won't happen if callers are obeying + * the restrictions correctly. Can happen for free calls + * on a completely full ag. + */ + if (targs.agbno == NULLAGBLOCK) { + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; + } + /* + * Put each allocated block on the list. + */ + for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) + return error; + } + } + xfs_trans_brelse(tp, agflbp); + args->agbp = agbp; + return 0; +} + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. + */ +int /* error */ +xfs_alloc_get_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ + xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; + int error; + int logflags; + xfs_mount_t *mp = tp->t_mountp; + xfs_perag_t *pag; /* per allocation group data */ + + /* + * Freelist is empty, give up. + */ + agf = XFS_BUF_TO_AGF(agbp); + if (!agf->agf_flcount) { + *bnop = NULLAGBLOCK; + return 0; + } + /* + * Read the array of free blocks. + */ + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) + return error; + + + /* + * Get the block number and update the data structures. 
+ */ + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); + xfs_trans_brelse(tp, agflbp); + if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + agf->agf_flfirst = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, -1); + xfs_trans_agflist_delta(tp, -1); + pag->pagf_flcount--; + xfs_perag_put(pag); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + xfs_alloc_log_agf(tp, agbp, logflags); + *bnop = bno; + + return 0; +} + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* buffer for a.g. freelist header */ + int fields) /* mask of fields to be logged (XFS_AGF_...) */ +{ + int first; /* first byte offset */ + int last; /* last byte offset */ + static const short offsets[] = { + offsetof(xfs_agf_t, agf_magicnum), + offsetof(xfs_agf_t, agf_versionnum), + offsetof(xfs_agf_t, agf_seqno), + offsetof(xfs_agf_t, agf_length), + offsetof(xfs_agf_t, agf_roots[0]), + offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_flfirst), + offsetof(xfs_agf_t, agf_fllast), + offsetof(xfs_agf_t, agf_flcount), + offsetof(xfs_agf_t, agf_freeblks), + offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), + sizeof(xfs_agf_t) + }; + + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); + xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); +} + +/* + * Interface for inode allocation to force the pag data to be initialized. 
+ */ +int /* error */ +xfs_alloc_pagf_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags) /* XFS_ALLOC_FLAGS_... */ +{ + xfs_buf_t *bp; + int error; + + if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp))) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* buffer for a.g. freelist header */ + xfs_buf_t *agflbp,/* buffer for a.g. free block array */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ +{ + xfs_agf_t *agf; /* a.g. freespace structure */ + __be32 *blockp;/* pointer to array entry */ + int error; + int logflags; + xfs_mount_t *mp; /* mount structure */ + xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; + + agf = XFS_BUF_TO_AGF(agbp); + mp = tp->t_mountp; + + if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, + be32_to_cpu(agf->agf_seqno), &agflbp))) + return error; + be32_add_cpu(&agf->agf_fllast, 1); + if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + agf->agf_fllast = 0; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, 1); + xfs_trans_agflist_delta(tp, 1); + pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + xfs_perag_put(pag); + + xfs_alloc_log_agf(tp, agbp, logflags); + + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; + *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + + xfs_alloc_log_agf(tp, agbp, logflags); + + 
xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); + return 0; +} + +static bool +xfs_agf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) + { + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; + + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; + + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + return false; + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. 
+ */ + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; + +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + +/* + * Read in the allocation group header (free/alloc section). 
+ */ +int /* error */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + int error; + + trace_xfs_read_agf(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf( + mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + if (error) + return error; + if (!*bpp) + return 0; + + ASSERT(!(*bpp)->b_error); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_alloc_read_agf(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!(*bpp)->b_error); + + agf = XFS_BUF_TO_AGF(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagf_init) { + pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); + pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); + pag->pagf_longest = be32_to_cpu(agf->agf_longest); + pag->pagf_levels[XFS_BTNUM_BNOi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); + pag->pagf_levels[XFS_BTNUM_CNTi] = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); + spin_lock_init(&pag->pagb_lock); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + pag->pagf_init = 1; + } +#ifdef DEBUG + else if (!XFS_FORCED_SHUTDOWN(mp)) { + ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); + ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); + ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); + ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); + ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + } +#endif + xfs_perag_put(pag); + return 0; +} + +/* + * Allocate an extent (variable-size). + * Depending on the allocation type, we either look in a single allocation + * group or loop over the allocation groups to find the result. + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args) /* allocation argument structure */ +{ + xfs_agblock_t agsize; /* allocation group size */ + int error; + int flags; /* XFS_ALLOC_FLAG_... 
locking flags */ + xfs_extlen_t minleft;/* minimum left value, temp copy */ + xfs_mount_t *mp; /* mount structure pointer */ + xfs_agnumber_t sagno; /* starting allocation group number */ + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + int no_min = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure + * that out (xfs_bmap_alloc). + */ + agsize = mp->m_sb.sb_agblocks; + if (args->maxlen > agsize) + args->maxlen = agsize; + if (args->alignment == 0) + args->alignment = 1; + ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); + ASSERT(args->minlen <= args->maxlen); + ASSERT(args->minlen <= agsize); + ASSERT(args->mod < args->prod); + if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || + XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || + args->minlen > args->maxlen || args->minlen > agsize || + args->mod >= args->prod) { + args->fsbno = NULLFSBLOCK; + trace_xfs_alloc_vextent_badargs(args); + return 0; + } + minleft = args->minleft; + + switch (type) { + case XFS_ALLOCTYPE_THIS_AG: + case XFS_ALLOCTYPE_NEAR_BNO: + case XFS_ALLOCTYPE_THIS_BNO: + /* + * These three force us into a single a.g. 
+ */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->pag = xfs_perag_get(mp, args->agno); + args->minleft = 0; + error = xfs_alloc_fix_freelist(args, 0); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + if (!args->agbp) { + trace_xfs_alloc_vextent_noagbp(args); + break; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + case XFS_ALLOCTYPE_START_BNO: + /* + * Try near allocation first, then anywhere-in-ag after + * the first a.g. fails. + */ + if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + (mp->m_flags & XFS_MOUNT_32BITINODES)) { + args->fsbno = XFS_AGB_TO_FSB(mp, + ((mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount), 0); + bump_rotor = 1; + } + args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + /* FALLTHROUGH */ + case XFS_ALLOCTYPE_ANY_AG: + case XFS_ALLOCTYPE_START_AG: + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. + */ + if (type == XFS_ALLOCTYPE_ANY_AG) { + /* + * Start with the last place we left off. + */ + args->agno = sagno = (mp->m_agfrotor / rotorstep) % + mp->m_sb.sb_agcount; + args->type = XFS_ALLOCTYPE_THIS_AG; + flags = XFS_ALLOC_FLAG_TRYLOCK; + } else if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* + * Start with allocation group given by bno. + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; + sagno = 0; + flags = 0; + } else { + if (type == XFS_ALLOCTYPE_START_AG) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * Start with the given allocation group. + */ + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. 
+ */ + for (;;) { + args->pag = xfs_perag_get(mp, args->agno); + if (no_min) args->minleft = 0; + error = xfs_alloc_fix_freelist(args, flags); + args->minleft = minleft; + if (error) { + trace_xfs_alloc_vextent_nofix(args); + goto error0; + } + /* + * If we get a buffer back then the allocation will fly. + */ + if (args->agbp) { + if ((error = xfs_alloc_ag_vextent(args))) + goto error0; + break; + } + + trace_xfs_alloc_vextent_loopfailed(args); + + /* + * Didn't work, figure out the next iteration. + */ + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; + /* + * For the first allocation, we can try any AG to get + * space. However, if we already have allocated a + * block, we don't want to try AGs whose number is below + * sagno. Otherwise, we may end up with out-of-order + * locking of AGF, which might cause deadlock. + */ + if (++(args->agno) == mp->m_sb.sb_agcount) { + if (args->firstblock != NULLFSBLOCK) + args->agno = sagno; + else + args->agno = 0; + } + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. 
+ */ + if (args->agno == sagno) { + if (no_min == 1) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_vextent_allfailed(args); + break; + } + if (flags == 0) { + no_min = 1; + } else { + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + } + } + } + xfs_perag_put(args->pag); + } + if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { + if (args->agno == sagno) + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + else + mp->m_agfrotor = (args->agno * rotorstep + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } + break; + default: + ASSERT(0); + /* NOTREACHED */ + } + if (args->agbno == NULLAGBLOCK) + args->fsbno = NULLFSBLOCK; + else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); +#ifdef DEBUG + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(args->agbno % args->alignment == 0); + XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), + args->len); +#endif + } + xfs_perag_put(args->pag); + return 0; +error0: + xfs_perag_put(args->pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_alloc_arg_t args; + int error; + + ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. 
+ */ + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + if (args.agno >= args.mp->m_sb.sb_agcount) + return -EFSCORRUPTED; + + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; + + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = -EFSCORRUPTED; + goto error0; + } + + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); +error0: + xfs_perag_put(args.pag); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_alloc.h b/kernel/fs/xfs/libxfs/xfs_alloc.h new file mode 100644 index 000000000..d1b4b6a5c --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_H__ +#define __XFS_ALLOC_H__ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; +struct xfs_perag; +struct xfs_trans; + +extern struct workqueue_struct *xfs_alloc_wq; + +/* + * Freespace allocation types. 
Argument to xfs_alloc_[v]extent. + */ +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; +/* { value, name-string } initializer pairs, one per XFS_ALLOCTYPE_* above */ +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } + +/* + * Flags for xfs_alloc_fix_freelist. + */ +#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents */ + +/* + * In order to avoid ENOSPC-related deadlock caused by + * out-of-order locking of AGF buffer (PV 947395), we place + * constraints on the relationship among actual allocations for + * data blocks, freelist blocks, and potential file data bmap + * btree blocks. However, these restrictions may result in no + * actual space allocated for a delayed extent, for example, a data + * block in a certain AG is allocated but there is no additional + * block for the additional bmap btree block due to a split of the + * bmap btree of the file. The result of this may lead to an + * infinite loop in xfssyncd when the file gets flushed to disk and + * all delayed extents need to be actually allocated. To get around + * this, we explicitly set aside a few blocks which will not be + * reserved in delayed allocation. 
Considering the minimum number of + * needed freelist blocks is 4 fsbs _per AG_, a potential split of file's bmap + * btree requires 1 fsb, so we set the number of set-aside blocks + * to 4 + 4*agcount. + */ +#define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) + +/* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size of the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + * In the macro: 4 header sectors converted to blocks, and 7 = 3 btree + * roots + 4 AGFL blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* + * Argument structure for xfs_alloc routines. + * This is turned into a structure to avoid having 20 arguments passed + * down several levels of the stack. + */ +typedef struct xfs_alloc_arg { + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for a.g. 
freelist header */ + struct xfs_perag *pag; /* per-ag struct for this agno */ + xfs_fsblock_t fsbno; /* file system block number */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_agblock_t agbno; /* allocation group-relative block # */ + xfs_extlen_t minlen; /* minimum size of extent */ + xfs_extlen_t maxlen; /* maximum size of extent */ + xfs_extlen_t mod; /* mod value for extent size */ + xfs_extlen_t prod; /* prod value for extent size */ + xfs_extlen_t minleft; /* min blocks must be left after us */ + xfs_extlen_t total; /* total blocks needed in xaction */ + xfs_extlen_t alignment; /* align answer to multiple of this */ + xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_extlen_t len; /* output: actual size of extent */ + xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ + xfs_alloctype_t otype; /* type as passed in; saved by xfs_alloc_vextent */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ + char isfl; /* set if is freelist blocks - !acctg */ + char userdata; /* set if this is user data, see defines below */ + xfs_fsblock_t firstblock; /* io first block allocated */ +} xfs_alloc_arg_t; + +/* + * Values for the userdata field of xfs_alloc_arg + */ +#define XFS_ALLOC_USERDATA 1 /* allocation is for user data */ +#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ + +/* + * Find the length of the longest free extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); + +/* + * Compute and fill in value of m_ag_maxlevels. + */ +void +xfs_alloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Get a block from the freelist. + * Returns with the buffer for the block gotten. 
+ */ +int /* error */ +xfs_alloc_get_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer containing the agf structure */ + xfs_agblock_t *bnop, /* out: block taken from freelist; callers check for NULLAGBLOCK */ + int btreeblk); /* destination is an AGF btree */ + +/* + * Log the given fields from the agf structure. + */ +void +xfs_alloc_log_agf( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* buffer for a.g. freelist header */ + int fields);/* mask of fields to be logged (XFS_AGF_...) */ + +/* + * Interface for inode allocation to force the pag data to be initialized. + */ +int /* error */ +xfs_alloc_pagf_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags); /* XFS_ALLOC_FLAG_... */ + +/* + * Put the block on the freelist for the allocation group. + */ +int /* error */ +xfs_alloc_put_freelist( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for a.g. freelist header */ + struct xfs_buf *agflbp,/* buffer for a.g. free block array; the alloc btree code passes NULL */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was an AGF btree */ + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp); /* buffer for the ag freelist header */ + +/* + * Allocate an extent (variable-size). + */ +int /* error */ +xfs_alloc_vextent( + xfs_alloc_arg_t *args); /* allocation argument structure */ + +/* + * Free an extent. 
+ */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len); /* length of extent */ +/* Cursor helpers keyed by (bno, len). NOTE(review): _le/_ge presumably mean <= / >= lookups - confirm. */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* output: success/failure */ + +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* output: success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ + +int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + +#endif /* __XFS_ALLOC_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.c b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c new file mode 100644 index 000000000..59d521c09 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.c @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" + +/* Clone @cur: same mount, transaction, AGF buffer, AG and btree type. */ +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} +/* Set the AGF root of this btree to @ptr, adjust levels by @inc, log the AGF. */ +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); +} +/* Take a btree block from the AG freelist; *stat is 0 if none came back, else 1. */ +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + int error; + xfs_agblock_t bno; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* Allocate the new block from the freelist. If we can't, give up. 
*/ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} +/* Put @bp's block back on the AG freelist, mark it busy, invalidate the buffer. */ +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); + return 0; +} + +/* + * Update the longest extent in the AGF and in the in-core perag. + * Only valid for XFS_BTNUM_CNT cursors (asserted below). + */ +STATIC void +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; + __be32 len; + int numrecs; + + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. 
+ */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); + + if (numrecs) { + xfs_alloc_rec_t *rrp; + + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; + } else { + len = 0; + } + + break; + default: + ASSERT(0); + return; + } + + agf->agf_longest = len; + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); +} +/* Minimum records per block: slot 0 for level 0, slot 1 for higher levels. */ +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mnr[level != 0]; +} +/* Maximum records per block: slot 0 for level 0, slot 1 for higher levels. */ +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_alloc_mxr[level != 0]; +} +/* Copy startblock and blockcount from @rec into @key. */ +STATIC void +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(rec->alloc.ar_startblock != 0); + + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; +} +/* Copy startblock and blockcount from @key into @rec. */ +STATIC void +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->alloc.ar_startblock != 0); + + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; +} +/* Fill @rec from the cursor's in-core record, converting to big-endian. */ +STATIC void +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + ASSERT(cur->bc_rec.a.ar_startblock != 0); + + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); +} +/* Point @ptr at the AGF root block recorded for the cursor's btree. */ +STATIC void +xfs_allocbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + 
struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); + + ptr->s = agf->agf_roots[cur->bc_btnum]; +} +/* @key minus the cursor record: by startblock (BNO), else blockcount then startblock. */ +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; + } + + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; + + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} +/* Check magic, CRC-format header fields, level, numrecs and sibling pointers. */ +static bool +xfs_allocbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. 
+ */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + default: + return false; + } + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} +/* Read verifier: bad CRC sets -EFSBADCRC, failed structure check sets -EFSCORRUPTED. */ +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + 
trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} +/* Write verifier: flag blocks that fail the structure check, else compute the CRC. */ +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} +/* Buffer verifier hooks for alloc btree blocks. */ +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; + +/* Ordering checks, built only with DEBUG or XFS_WARN. */ +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); + } else { + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); + } +} +/* BNO: @r1 must end at or before @r2 starts; otherwise order by blockcount, startblock. */ +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); + } +} +#endif /* DEBUG */ +/* Operations vector installed as bc_ops by xfs_allocbt_init_cursor(). */ +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = 
xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, +#endif +}; + +/* + * Allocate a new allocation btree cursor. + * btnum must be XFS_BTNUM_BNO or XFS_BTNUM_CNT (asserted); CNT cursors get + * XFS_BTREE_LASTREC_UPDATE, and XFS_BTREE_CRC_BLOCKS is added when the + * superblock has CRCs. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_allocbt_ops; + + if (btnum == XFS_BTNUM_CNT) { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + } else { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + } + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + return cur; +} + +/* + * Calculate number of records in an alloc btree block. 
+ */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + /* blocklen is now the payload: records if leaf, else key/pointer pairs */ + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} diff --git a/kernel/fs/xfs/libxfs/xfs_alloc_btree.h b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h new file mode 100644 index 000000000..45e189e7e --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_alloc_btree.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ALLOC_BTREE_H__ +#define __XFS_ALLOC_BTREE_H__ + +/* + * Freespace on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size depends on whether the superblock has CRCs + * enabled (xfs_sb_version_hascrc). + */ +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) + +/* + * Record, key, and pointer address macros for btree blocks. 
+ * + * (note that some of these may appear unused, but they are used in userspace) + * + * index is 1-based. Records and keys start right after the block header; + * pointers start after room for maxrecs keys. + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) +/* Cursor constructor and per-block record capacity, from xfs_alloc_btree.c */ +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr.c b/kernel/fs/xfs/libxfs/xfs_attr.c new file mode 100644 index 000000000..0a472fbe0 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr.c @@ -0,0 +1,1456 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * xfs_attr.c + * + * Provide the external interfaces to manage attribute lists. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Internal routines when attribute list fits inside the inode. + */ +STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is one block. + */ +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); +STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); + +/* + * Internal routines when attribute list is more than one block. 
+ */ +STATIC int xfs_attr_node_get(xfs_da_args_t *args); +STATIC int xfs_attr_node_addname(xfs_da_args_t *args); +STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC int xfs_attr_fillstate(xfs_da_state_t *state); +STATIC int xfs_attr_refillstate(xfs_da_state_t *state); + +/* + * Zero @args and set it up for an attr-fork operation on @name in @dp + * (geometry, fork, flags, name, length and hash). + * Returns -EINVAL for a NULL name and -EFAULT when the name is MAXNAMELEN + * bytes or longer; 0 otherwise. + */ +STATIC int +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) +{ + + if (!name) + return -EINVAL; + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) + return -EFAULT; /* match IRIX behaviour */ + + args->hashval = xfs_da_hashname(args->name, args->namelen); + return 0; +} +/* Return 0 if @ip has no attr fork or an extents-format fork with no extents, else 1. */ +int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} + +/*======================================================================== + * Overall external interface routines. 
+ *========================================================================*/ +/* + * Look up attribute @name on @ip with the shortform, leaf or node routine + * that matches the attr fork's current state; @value is handed over as + * args.value. *valuelenp seeds args.valuelen and is rewritten from it on + * return. Returns -EIO after a forced shutdown and -ENOATTR when the inode + * has no attributes; -EEXIST from the lookup is reported as 0. + */ +int +xfs_attr_get( + struct xfs_inode *ip, + const unsigned char *name, + unsigned char *value, + int *valuelenp, + int flags) +{ + struct xfs_da_args args; + uint lock_mode; + int error; + + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (!xfs_inode_hasattr(ip)) + return -ENOATTR; + + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = *valuelenp; + + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = -ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + error = xfs_attr_shortform_getvalue(&args); + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + error = xfs_attr_leaf_get(&args); + else + error = xfs_attr_node_get(&args); + xfs_iunlock(ip, lock_mode); + + *valuelenp = args.valuelen; + return error == -EEXIST ? 0 : error; +} + +/* + * Calculate how many blocks we need for the new attribute. + * *local is filled in by xfs_attr_leaf_newentsize(). + */ +STATIC int +xfs_attr_calc_size( + struct xfs_da_args *args, + int *local) +{ + struct xfs_mount *mp = args->dp->i_mount; + int size; + int nblks; + + /* + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). + */ + size = xfs_attr_leaf_newentsize(args, local); + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (args->geo->blksize / 2)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. 
+ */ + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } + + return nblks; +} + +int +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + struct xfs_trans_res tres; + xfs_fsblock_t firstblock; + int rsvd = (flags & ATTR_ROOT) != 0; + int error, err2, committed, local; + + XFS_STATS_INC(xs_attr_set); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); + + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); + + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. 
+ */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (rsvd) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(args.trans, &tres, args.total, 0); + if (error) { + xfs_trans_cancel(args.trans, 0); + return error; + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) { + xfs_iunlock(dp, XFS_ILOCK_EXCL); + xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_trans_ijoin(args.trans, dp, 0); + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(&args); + + /* + * Try to add the attr to the attribute list in + * the inode. + */ + error = xfs_attr_shortform_addname(&args); + if (error != -ENOSPC) { + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + ASSERT(args.trans != NULL); + + /* + * If this is a synchronous mount, make sure that + * the transaction goes to disk before returning + * to the user. 
+ */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); + } + err2 = xfs_trans_commit(args.trans, + XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error ? error : err2; + } + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + xfs_bmap_init(args.flist, args.firstblock); + error = xfs_attr_shortform_to_leaf(&args); + if (!error) { + error = xfs_bmap_finish(&args.trans, args.flist, + &committed); + } + if (error) { + ASSERT(committed); + args.trans = NULL; + xfs_bmap_cancel(&flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args.trans, dp, 0); + + /* + * Commit the leaf transformation. We'll need another (linked) + * transaction to add the new attribute to the leaf. + */ + + error = xfs_trans_roll(&args.trans, dp); + if (error) + goto out; + + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_addname(&args); + else + error = xfs_attr_node_addname(&args); + if (error) + goto out; + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. 
+ */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error; + +out: + if (args.trans) { + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ +int +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; + + XFS_STATS_INC(xs_attr_remove); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; + + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; + + args.firstblock = &firstblock; + args.flist = &flist; + + /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. 
+ */ + args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); + + /* + * Root fork attributes can use reserved data blocks for this + * operation if necessary + */ + + if (flags & ATTR_ROOT) + args.trans->t_flags |= XFS_TRANS_RESERVE; + + error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(args.trans, 0); + return error; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL); + /* + * No need to make quota reservations here. We expect to release some + * blocks not allocate in the common case. + */ + xfs_trans_ijoin(args.trans, dp, 0); + + if (!xfs_inode_hasattr(dp)) { + error = -ENOATTR; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(&args); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(&args); + } else { + error = xfs_attr_node_removename(&args); + } + + if (error) + goto out; + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args.trans); + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + + /* + * Commit the last in the sequence of transactions. + */ + xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return error; + +out: + if (args.trans) { + xfs_trans_cancel(args.trans, + XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/*======================================================================== + * External routines when attribute list is inside the inode + *========================================================================*/ + +/* + * Add a name to the shortform attribute list structure + * This is the external routine. 
+ */ +STATIC int +xfs_attr_shortform_addname(xfs_da_args_t *args) +{ + int newsize, forkoff, retval; + + trace_xfs_attr_sf_addname(args); + + retval = xfs_attr_shortform_lookup(args); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + return retval; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) + return retval; + retval = xfs_attr_shortform_remove(args); + ASSERT(retval == 0); + } + + if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || + args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) + return -ENOSPC; + + newsize = XFS_ATTR_SF_TOTSIZE(args->dp); + newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); + + forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); + if (!forkoff) + return -ENOSPC; + + xfs_attr_shortform_add(args, forkoff); + return 0; +} + + +/*======================================================================== + * External routines when attribute list is one block + *========================================================================*/ + +/* + * Add a name to the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_addname(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + struct xfs_buf *bp; + int retval, error, committed, forkoff; + + trace_xfs_attr_leaf_addname(args); + + /* + * Read the (only) block in the attribute list in. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + /* + * Look up the given attribute in the leaf block. Figure out if + * the given flags produce an error or call for an atomic rename. 
+ */ + retval = xfs_attr3_leaf_lookup_int(bp, args); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + xfs_trans_brelse(args->trans, bp); + return retval; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) { /* pure create op */ + xfs_trans_brelse(args->trans, bp); + return retval; + } + + trace_xfs_attr_leaf_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; + } + + /* + * Add the attribute to the leaf block, transitioning to a Btree + * if required. + */ + retval = xfs_attr3_leaf_add(bp, args); + if (retval == -ENOSPC) { + /* + * Promote the attribute list to the Btree format, then + * Commit that transaction so that the node_addname() call + * can manage its own transactions. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the current trans (including the inode) and start + * a new one. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + + /* + * Fob the whole rest of the problem off on the Btree code. 
+ */ + error = xfs_attr_node_addname(args); + return error; + } + + /* + * Commit the transaction that added the attr name so that + * later routines can manage their own transactions. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + return error; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + /* + * Read in the block containing the "old" attr, then + * remove the "old" attr from that block (neat, huh!) + */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); + if (error) + return error; + + xfs_attr3_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. 
+ */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + + /* + * Commit the remove and start the next trans in series. + */ + error = xfs_trans_roll(&args->trans, dp); + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr3_leaf_clearflag(args); + } + return error; +} + +/* + * Remove a name from the leaf attribute list structure + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_removename(xfs_da_args_t *args) +{ + xfs_inode_t *dp; + struct xfs_buf *bp; + int error, committed, forkoff; + + trace_xfs_attr_leaf_removename(args); + + /* + * Remove the attribute. + */ + dp = args->dp; + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + error = xfs_attr3_leaf_lookup_int(bp, args); + if (error == -ENOATTR) { + xfs_trans_brelse(args->trans, bp); + return error; + } + + xfs_attr3_leaf_remove(bp, args); + + /* + * If the result is small enough, shrink it all into the inode. 
+ */ + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + return 0; +} + +/* + * Look up a name in a leaf attribute list structure. + * + * This leaf block cannot have a "remote" value, we only call this routine + * if bmap_one_block() says there is only one block (ie: no remote blks). + */ +STATIC int +xfs_attr_leaf_get(xfs_da_args_t *args) +{ + struct xfs_buf *bp; + int error; + + trace_xfs_attr_leaf_get(args); + + args->blkno = 0; + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + error = xfs_attr3_leaf_lookup_int(bp, args); + if (error != -EEXIST) { + xfs_trans_brelse(args->trans, bp); + return error; + } + error = xfs_attr3_leaf_getvalue(bp, args); + xfs_trans_brelse(args->trans, bp); + if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { + error = xfs_attr_rmtval_get(args); + } + return error; +} + +/*======================================================================== + * External routines when attribute list size > geo->blksize + *========================================================================*/ + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). 
+ * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + */ +STATIC int +xfs_attr_node_addname(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + xfs_mount_t *mp; + int committed, retval, error; + + trace_xfs_attr_node_addname(args); + + /* + * Fill in bucket of arguments/results/context to carry around. + */ + dp = args->dp; + mp = dp->i_mount; +restart: + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + + /* + * Search to see if name already exists, and get back a pointer + * to where it should go. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + goto out; + } else if (retval == -EEXIST) { + if (args->flags & ATTR_CREATE) + goto out; + + trace_xfs_attr_node_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ + args->blkno2 = args->blkno; /* set 2nd entry info*/ + args->index2 = args->index; + args->rmtblkno2 = args->rmtblkno; + args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; + } + + retval = xfs_attr3_leaf_add(blk->bp, state->args); + if (retval == -ENOSPC) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. 
+ */ + xfs_da_state_free(state); + state = NULL; + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the node conversion and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + goto restart; + } + + /* + * Split as many Btree elements as required. + * This code tracks the new and old attr's location + * in the index/blkno/rmtblkno/rmtblkcnt fields and + * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. + */ + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_split(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } else { + /* + * Addition succeeded, update Btree hashvals. + */ + xfs_da3_fixhashpath(state, &state->path); + } + + /* + * Kill the state structure, we're done with it and need to + * allow the buffers to come back later. + */ + xfs_da_state_free(state); + state = NULL; + + /* + * Commit the leaf addition or btree split and start the next + * trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. 
This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_set(args); + if (error) + return error; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + */ + if (args->op_flags & XFS_DA_OP_RENAME) { + /* + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + + /* + * Dismantle the "old" attribute/value pair by removing + * a "remote" value (if it exists). + */ + args->index = args->index2; + args->blkno = args->blkno2; + args->rmtblkno = args->rmtblkno2; + args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; + if (args->rmtblkno) { + error = xfs_attr_rmtval_remove(args); + if (error) + return error; + } + + /* + * Re-find the "old" attribute entry after any split ops. + * The INCOMPLETE flag means that we will find the "old" + * attr, not the "new" one. + */ + args->flags |= XFS_ATTR_INCOMPLETE; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = mp; + state->inleaf = 0; + error = xfs_da3_node_lookup_int(state, &retval); + if (error) + goto out; + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. 
+ */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + + /* + * Commit and start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + + } else if (args->rmtblkno > 0) { + /* + * Added a "remote" value, just clear the incomplete flag. + */ + error = xfs_attr3_leaf_clearflag(args); + if (error) + goto out; + } + retval = error = 0; + +out: + if (state) + xfs_da_state_free(state); + if (error) + return error; + return retval; +} + +/* + * Remove a name from a B-tree attribute list. + * + * This will involve walking down the Btree, and may involve joining + * leaf nodes and even joining intermediate nodes up to and including + * the root node (a special case of an intermediate node). + */ +STATIC int +xfs_attr_node_removename(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + xfs_inode_t *dp; + struct xfs_buf *bp; + int retval, error, committed, forkoff; + + trace_xfs_attr_node_removename(args); + + /* + * Tie a string around our finger to remind us where we are. + */ + dp = args->dp; + state = xfs_da_state_alloc(); + state->args = args; + state->mp = dp->i_mount; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error || (retval != -EEXIST)) { + if (error == 0) + error = retval; + goto out; + } + + /* + * If there is an out-of-line value, de-allocate the blocks. 
+ * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a deadlock. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + if (args->rmtblkno > 0) { + /* + * Fill in disk block numbers in the state structure + * so that we can get the buffers back after we commit + * several transactions in the following calls. + */ + error = xfs_attr_fillstate(state); + if (error) + goto out; + + /* + * Mark the attribute as INCOMPLETE, then bunmapi() the + * remote value. + */ + error = xfs_attr3_leaf_setflag(args); + if (error) + goto out; + error = xfs_attr_rmtval_remove(args); + if (error) + goto out; + + /* + * Refill the state structure with buffers, the prior calls + * released our buffers. + */ + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + /* + * Remove the name and update the hashvals in the tree. + */ + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + /* + * Check to see if the tree needs to be collapsed. + */ + if (retval && (state->path.active > 1)) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + /* + * Commit the Btree join operation and start a new trans. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + goto out; + } + + /* + * If the result is small enough, push it all into the inode. 
+ */ + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + /* + * Have to get rid of the copy of this dabuf in the state. + */ + ASSERT(state->path.active == 1); + ASSERT(state->path.blk[0].bp); + state->path.blk[0].bp = NULL; + + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); + if (error) + goto out; + + if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + if (!error) { + error = xfs_bmap_finish(&args->trans, + args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + goto out; + } + + /* + * bmap_finish() may have committed the last trans + * and started a new one. We need the inode to be + * in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } else + xfs_trans_brelse(args->trans, bp); + } + error = 0; + +out: + xfs_da_state_free(state); + return error; +} + +/* + * Fill in the disk block numbers in the state structure for the buffers + * that are attached to the state structure. + * This is done so that we can quickly reattach ourselves to those buffers + * after some set of transaction commits have released these buffers. + */ +STATIC int +xfs_attr_fillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level; + + trace_xfs_attr_fillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". 
+ */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". + */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->bp) { + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->bp = NULL; + } else { + blk->disk_blkno = 0; + } + } + + return 0; +} + +/* + * Reattach the buffers to the state structure based on the disk block + * numbers stored in the state structure. + * This is done after some set of transaction commits have released those + * buffers from our grip. + */ +STATIC int +xfs_attr_refillstate(xfs_da_state_t *state) +{ + xfs_da_state_path_t *path; + xfs_da_state_blk_t *blk; + int level, error; + + trace_xfs_attr_refillstate(state->args); + + /* + * Roll down the "path" in the state structure, storing the on-disk + * block number for those buffers in the "path". + */ + path = &state->path; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + /* + * Roll down the "altpath" in the state structure, storing the on-disk + * block number for those buffers in the "altpath". 
+ */ + path = &state->altpath; + ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); + for (blk = path->blk, level = 0; level < path->active; blk++, level++) { + if (blk->disk_blkno) { + error = xfs_da3_node_read(state->args->trans, + state->args->dp, + blk->blkno, blk->disk_blkno, + &blk->bp, XFS_ATTR_FORK); + if (error) + return error; + } else { + blk->bp = NULL; + } + } + + return 0; +} + +/* + * Look up a filename in a node attribute list. + * + * This routine gets called for any attribute fork that has more than one + * block, ie: both true Btree attr lists and for single-leaf-blocks with + * "remote" values taking up more blocks. + */ +STATIC int +xfs_attr_node_get(xfs_da_args_t *args) +{ + xfs_da_state_t *state; + xfs_da_state_blk_t *blk; + int error, retval; + int i; + + trace_xfs_attr_node_get(args); + + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + + /* + * Search to see if name exists, and get back a pointer to it. + */ + error = xfs_da3_node_lookup_int(state, &retval); + if (error) { + retval = error; + } else if (retval == -EEXIST) { + blk = &state->path.blk[ state->path.active-1 ]; + ASSERT(blk->bp != NULL); + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + /* + * Get the value, local or "remote" + */ + retval = xfs_attr3_leaf_getvalue(blk->bp, args); + if (!retval && (args->rmtblkno > 0) + && !(args->flags & ATTR_KERNOVAL)) { + retval = xfs_attr_rmtval_get(args); + } + } + + /* + * If not in a transaction, we have to release all the buffers. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + + xfs_da_state_free(state); + return retval; +} diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.c b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c new file mode 100644 index 000000000..e9d401ce9 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.c @@ -0,0 +1,2773 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. 
+ * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dir2.h" + + +/* + * xfs_attr_leaf.c + * + * Routines to implement leaf blocks of attributes as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. 
+ */ +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); + +/* + * Utility routines. + */ +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); + +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. 
+ */
+
+/*
+ * Load the in-core firstused value from whichever on-disk header layout the
+ * block's magic number selects, expanding the magic "null offset" value back
+ * to the real block size.
+ */
+static void
+xfs_attr3_leaf_firstused_from_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	bool	is_attr3;
+
+	is_attr3 = from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC);
+	if (is_attr3)
+		to->firstused = be16_to_cpu(
+			((struct xfs_attr3_leaf_hdr *)from)->firstused);
+	else
+		to->firstused = be16_to_cpu(from->hdr.firstused);
+
+	/* Anything other than the magic value is already a real offset. */
+	if (to->firstused != XFS_ATTR3_LEAF_NULLOFF)
+		return;
+
+	/*
+	 * The magic value stands in for a block size that does not fit in
+	 * 16 bits; it is only expected on empty blocks.
+	 */
+	ASSERT(!to->count && !to->usedbytes);
+	ASSERT(geo->blksize > USHRT_MAX);
+	to->firstused = geo->blksize;
+}
+
+/*
+ * Store the in-core firstused value into the on-disk header selected by
+ * from->magic, substituting the magic "null offset" value when the in-core
+ * value is too large for the 16-bit on-disk field.
+ */
+static void
+xfs_attr3_leaf_firstused_to_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	uint32_t	ondisk = from->firstused;
+
+	/* magic value should only be seen on disk */
+	ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
+
+	/* Only a value equal to the block size may exceed 16 bits. */
+	if (ondisk > USHRT_MAX) {
+		ASSERT(from->firstused == geo->blksize);
+		ondisk = XFS_ATTR3_LEAF_NULLOFF;
+	}
+
+	if (from->magic != XFS_ATTR3_LEAF_MAGIC) {
+		to->hdr.firstused = cpu_to_be16(ondisk);
+		return;
+	}
+	((struct xfs_attr3_leaf_hdr *)to)->firstused = cpu_to_be16(ondisk);
+}
+
+/*
+ * Decode an on-disk attr leaf header (either layout, chosen by the magic
+ * number in the block) into the in-core header structure.
+ */
+void
+xfs_attr3_leaf_hdr_from_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr3_icleaf_hdr	*to,
+	struct xfs_attr_leafblock	*from)
+{
+	struct xfs_attr3_leaf_hdr	*hdr3;
+	int				m;
+
+	ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+	       from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+	if (from->hdr.info.magic != cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
+		/* XFS_ATTR_LEAF_MAGIC layout */
+		to->forw = be32_to_cpu(from->hdr.info.forw);
+		to->back = be32_to_cpu(from->hdr.info.back);
+		to->magic = be16_to_cpu(from->hdr.info.magic);
+		to->count = be16_to_cpu(from->hdr.count);
+		to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
+		/* count/usedbytes must be set first: the helper asserts on them */
+		xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+		to->holes = from->hdr.holes;
+
+		for (m = 0; m < XFS_ATTR_LEAF_MAPSIZE; m++) {
+			to->freemap[m].base =
+				be16_to_cpu(from->hdr.freemap[m].base);
+			to->freemap[m].size =
+				be16_to_cpu(from->hdr.freemap[m].size);
+		}
+		return;
+	}
+
+	/* XFS_ATTR3_LEAF_MAGIC layout */
+	hdr3 = (struct xfs_attr3_leaf_hdr *)from;
+	to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+	to->back = be32_to_cpu(hdr3->info.hdr.back);
+	to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+	to->count = be16_to_cpu(hdr3->count);
+	to->usedbytes = be16_to_cpu(hdr3->usedbytes);
+	xfs_attr3_leaf_firstused_from_disk(geo, to, from);
+	to->holes = hdr3->holes;
+
+	for (m = 0; m < XFS_ATTR_LEAF_MAPSIZE; m++) {
+		to->freemap[m].base = be16_to_cpu(hdr3->freemap[m].base);
+		to->freemap[m].size = be16_to_cpu(hdr3->freemap[m].size);
+	}
+}
+
+/*
+ * Encode the in-core attr leaf header into the on-disk layout selected by
+ * from->magic. The pad field of the on-disk header is zeroed.
+ */
+void
+xfs_attr3_leaf_hdr_to_disk(
+	struct xfs_da_geometry		*geo,
+	struct xfs_attr_leafblock	*to,
+	struct xfs_attr3_icleaf_hdr	*from)
+{
+	struct xfs_attr3_leaf_hdr	*hdr3;
+	int				m;
+
+	ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
+	       from->magic == XFS_ATTR3_LEAF_MAGIC);
+
+	if (from->magic != XFS_ATTR3_LEAF_MAGIC) {
+		/* XFS_ATTR_LEAF_MAGIC layout */
+		to->hdr.info.forw = cpu_to_be32(from->forw);
+		to->hdr.info.back = cpu_to_be32(from->back);
+		to->hdr.info.magic = cpu_to_be16(from->magic);
+		to->hdr.count = cpu_to_be16(from->count);
+		to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
+		xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+		to->hdr.holes = from->holes;
+		to->hdr.pad1 = 0;
+
+		for (m = 0; m < XFS_ATTR_LEAF_MAPSIZE; m++) {
+			to->hdr.freemap[m].base =
+				cpu_to_be16(from->freemap[m].base);
+			to->hdr.freemap[m].size =
+				cpu_to_be16(from->freemap[m].size);
+		}
+		return;
+	}
+
+	/* XFS_ATTR3_LEAF_MAGIC layout */
+	hdr3 = (struct xfs_attr3_leaf_hdr *)to;
+	hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+	hdr3->info.hdr.back = cpu_to_be32(from->back);
+	hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+	hdr3->count = cpu_to_be16(from->count);
+	hdr3->usedbytes = cpu_to_be16(from->usedbytes);
+	xfs_attr3_leaf_firstused_to_disk(geo, to, from);
+	hdr3->holes = from->holes;
+	hdr3->pad1 = 0;
+
+	for (m = 0; m < XFS_ATTR_LEAF_MAPSIZE; m++) {
+		hdr3->freemap[m].base = cpu_to_be16(from->freemap[m].base);
+		hdr3->freemap[m].size = cpu_to_be16(from->freemap[m].size);
+	}
+}
+
+/*
+ * Structural check shared by the read and write verifiers. Returns true when
+ * the magic number matches the superblock's CRC setting, the CRC-format
+ * uuid/blkno (when applicable) match this buffer, and the leaf is not empty.
+ */
+static bool
+xfs_attr3_leaf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_attr3_icleaf_hdr hdr;
+
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &hdr, bp->b_addr);
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr.magic != XFS_ATTR_LEAF_MAGIC)
+			return false;
+	} else {
+		struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+
+		if (hdr.magic != XFS_ATTR3_LEAF_MAGIC ||
+		    !uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid) ||
+		    be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
+			return false;
+	}
+
+	/* XXX: need to range check rest of attr header values */
+	/* XXX: hash order check? */
+
+	return hdr.count != 0;
+}
+
+/*
+ * Write verifier: reject a structurally bad leaf, otherwise (CRC filesystems
+ * only) stamp the LSN from the buffer log item, if any, and update the
+ * checksum.
+ */
+static void
+xfs_attr3_leaf_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
+
+	if (!xfs_attr3_leaf_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (bip)
+			hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+		xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+	}
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_attr3_leaf_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	bool			cksum_ok;
+
+	/* The checksum is only consulted on CRC-enabled filesystems. */
+	cksum_ok = !xfs_sb_version_hascrc(&mp->m_sb) ||
+		   xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
+
+	if (!cksum_ok)
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_attr3_leaf_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+
+/* Buffer verifier operations for attr leaf blocks. */
+const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+	.verify_read = xfs_attr3_leaf_read_verify,
+	.verify_write = xfs_attr3_leaf_write_verify,
+};
+
+/*
+ * Read an attr-fork leaf block through xfs_da_read_buf() with the leaf
+ * verifier attached. On success inside a transaction the buffer is tagged
+ * as an attr leaf buffer. Returns the xfs_da_read_buf() result.
+ */
+int
+xfs_attr3_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+
+	error = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+				XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
+	if (error || !tp)
+		return error;
+
+	xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
+	return 0;
+}
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+
+/*========================================================================
+ * External routines when attribute fork size < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Query whether the requested number of additional bytes of extended
+ * attribute space will be able to fit inline.
+ *
+ * Returns zero if not, else the di_forkoff fork offset to be used in the
+ * literal area for attribute data once the new bytes have been added.
+ *
+ * di_forkoff must be 8 byte aligned, hence is stored as a >>3 value;
+ * special case for dev/uuid inodes, they have fixed size data forks.
+ *
+ * Order of the checks below:
+ *  1. dev/uuid format inodes: answer is the fixed minimum offset or 0.
+ *  2. bytes already fits in the current attr fork: keep di_forkoff.
+ *  3. not mounted attr2: the fork cannot move, return 0.
+ *  4. otherwise clamp the candidate offset between minforkoff (room for
+ *     the data fork) and maxforkoff (room for an attr fork btree root).
+ */
+int
+xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
+{
+	int offset;	/* candidate fork offset, in 8 byte units */
+	int minforkoff;	/* lower limit on valid forkoff locations */
+	int maxforkoff;	/* upper limit on valid forkoff locations */
+	int dsize;	/* bytes the data fork needs in the literal area */
+	xfs_mount_t *mp = dp->i_mount;
+
+	/* rounded down */
+	offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	case XFS_DINODE_FMT_UUID:
+		minforkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		return (offset >= minforkoff) ? minforkoff : 0;
+	}
+
+	/*
+	 * If the requested numbers of bytes is smaller or equal to the
+	 * current attribute fork size we can always proceed.
+	 *
+	 * Note that if_bytes in the data fork might actually be larger than
+	 * the current data fork size is due to delalloc extents. In that
+	 * case either the extent count will go down when they are converted
+	 * to real extents, or the delalloc conversion will take care of the
+	 * literal area rebalancing.
+	 */
+	if (bytes <= XFS_IFORK_ASIZE(dp))
+		return dp->i_d.di_forkoff;
+
+	/*
+	 * For attr2 we can try to move the forkoff if there is space in the
+	 * literal area, but for the old format we are done if there is no
+	 * space in the fixed attribute fork.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_ATTR2))
+		return 0;
+
+	dsize = dp->i_df.if_bytes;
+
+	switch (dp->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/*
+		 * If there is no attr fork and the data fork is extents,
+		 * determine if creating the default attr fork will result
+		 * in the extents form migrating to btree. If so, the
+		 * minimum offset only needs to be the space required for
+		 * the btree root.
+		 */
+		if (!dp->i_d.di_forkoff && dp->i_df.if_bytes >
+		    xfs_default_attroffset(dp))
+			dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		/*
+		 * If we have a data btree then keep forkoff if we have one,
+		 * otherwise we are adding a new attr, so then we set
+		 * minforkoff to where the btree root can finish so we have
+		 * plenty of room for attrs
+		 */
+		if (dp->i_d.di_forkoff) {
+			if (offset < dp->i_d.di_forkoff)
+				return 0;
+			return dp->i_d.di_forkoff;
+		}
+		dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot);
+		break;
+	}
+
+	/*
+	 * A data fork btree root must have space for at least
+	 * MINDBTPTRS key/ptr pairs if the data fork is small or empty.
+	 */
+	minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS));
+	minforkoff = roundup(minforkoff, 8) >> 3;
+
+	/* attr fork btree root can have at least this many key/ptr pairs */
+	maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
+			XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	maxforkoff = maxforkoff >> 3;	/* rounded down */
+
+	if (offset >= maxforkoff)
+		return maxforkoff;
+	if (offset >= minforkoff)
+		return offset;
+	return 0;
+}
+
+/*
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2)
+ *
+ * Nothing is done unless the mount has XFS_MOUNT_ATTR2 set and the
+ * superblock does not yet carry the attr2 feature. The feature test is
+ * repeated under m_sb_lock, and the superblock is logged via xfs_log_sb()
+ * only after the lock has been dropped.
+ */
+STATIC void
+xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
+{
+	if ((mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    !(xfs_sb_version_hasattr2(&mp->m_sb))) {
+		spin_lock(&mp->m_sb_lock);
+		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
+			xfs_sb_version_addattr2(&mp->m_sb);
+			spin_unlock(&mp->m_sb_lock);
+			xfs_log_sb(tp);
+		} else
+			spin_unlock(&mp->m_sb_lock);
+	}
+}
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ *
+ * The attr fork of args->dp must exist and be empty. An extents-format
+ * attr fork is switched to local (inline) format first. The fork is then
+ * grown to hold just the shortform header (count 0, totsize equal to the
+ * header size) and the inode is logged in args->trans.
+ */
+void
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+	xfs_attr_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_create(args);
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ifp = dp->i_afp;
+	ASSERT(ifp != NULL);
+	ASSERT(ifp->if_bytes == 0);
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+		ifp->if_flags |= XFS_IFINLINE;
+	} else {
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+	}
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data; /* read after realloc, as the original does */
+	hdr->count = 0;
+	hdr->totsize = cpu_to_be16(sizeof(*hdr));
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+void
+xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
+{
+	xfs_inode_t		*dp = args->dp;
+	xfs_mount_t		*mp = dp->i_mount;
+	xfs_ifork_t		*ifp;
+	xfs_attr_shortform_t	*sf;
+	xfs_attr_sf_entry_t	*sfe;
+	int			tail_off;
+	int			entsize;
+	int			n;
+
+	trace_xfs_attr_sf_add(args);
+
+	dp->i_d.di_forkoff = forkoff;
+
+	ifp = dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+
+	/*
+	 * Step over every existing entry so that sfe ends up just past the
+	 * last one. DEBUG builds additionally trip an assert if the name
+	 * being added is already present.
+	 */
+	sfe = &sf->list[0];
+	n = 0;
+	while (n < sf->hdr.count) {
+#ifdef DEBUG
+		if (sfe->namelen == args->namelen &&
+		    memcmp(args->name, sfe->nameval, args->namelen) == 0 &&
+		    xfs_attr_namesp_match(args->flags, sfe->flags))
+			ASSERT(0);
+#endif
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		n++;
+	}
+
+	/*
+	 * Growing the fork may move the data, so remember the insertion
+	 * point as an offset and recompute both pointers afterwards.
+	 */
+	tail_off = (char *)sfe - (char *)sf;
+	entsize = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	xfs_idata_realloc(dp, entsize, XFS_ATTR_FORK);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = (xfs_attr_sf_entry_t *)((char *)sf + tail_off);
+
+	/* Fill in the new entry: header fields, then name, then value. */
+	sfe->namelen = args->namelen;
+	sfe->valuelen = args->valuelen;
+	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
+	memcpy(sfe->nameval, args->name, args->namelen);
+	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
+
+	sf->hdr.count++;
+	be16_add_cpu(&sf->hdr.totsize, entsize);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+}
+
+/*
+ * After the last attribute is removed revert to original inode format,
+ * making all literal area available to the data fork once more.
+ */
+void
+xfs_attr_fork_remove(
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp)
+{
+	/* Tear down the in-core attr fork, then reset the on-disk fields. */
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_forkoff = 0;
+
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT(ip->i_afp == NULL);
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+}
+
+/*
+ * Remove an attribute from the shortform attribute list structure.
+ *
+ * Returns -ENOATTR if no entry matches args' name and namespace, else 0.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+	xfs_inode_t		*dp = args->dp;
+	xfs_mount_t		*mp = dp->i_mount;
+	xfs_attr_shortform_t	*sf;
+	xfs_attr_sf_entry_t	*sfe;
+	int			nentries;
+	int			start;		/* byte offset of the victim */
+	int			hole_end;	/* byte offset just past it */
+	int			size = 0;
+	int			totsize;
+	int			i;
+
+	trace_xfs_attr_sf_remove(args);
+
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	nentries = sf->hdr.count;
+
+	/* Walk the entries, tracking the byte offset of the current one. */
+	start = sizeof(xfs_attr_sf_hdr_t);
+	sfe = &sf->list[0];
+	i = 0;
+	while (i < nentries) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (sfe->namelen == args->namelen &&
+		    memcmp(sfe->nameval, args->name, args->namelen) == 0 &&
+		    xfs_attr_namesp_match(args->flags, sfe->flags))
+			break;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		start += size;
+		i++;
+	}
+	if (i == nentries)
+		return -ENOATTR;
+
+	/*
+	 * Close the gap left by the removed entry by sliding everything
+	 * after it down, then fix up the header counters.
+	 */
+	hole_end = start + size;
+	totsize = be16_to_cpu(sf->hdr.totsize);
+	if (hole_end != totsize)
+		memmove(&((char *)sf)[start], &((char *)sf)[hole_end],
+			totsize - hole_end);
+	sf->hdr.count--;
+	be16_add_cpu(&sf->hdr.totsize, -size);
+
+	/*
+	 * Either drop the now-empty attr fork altogether, or shrink it and
+	 * recompute where it starts.
+	 */
+	totsize -= size;
+	if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+	    (mp->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+		xfs_attr_fork_remove(dp, args->trans);
+	} else {
+		xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+		dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
+		ASSERT(dp->i_d.di_forkoff);
+		ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+				(args->op_flags & XFS_DA_OP_ADDNAME) ||
+				!(mp->m_flags & XFS_MOUNT_ATTR2) ||
+				dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+		xfs_trans_log_inode(args->trans, dp,
+					XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	}
+
+	xfs_sbversion_add_attr2(mp, args->trans);
+
+	return 0;
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ *
+ * Returns -EEXIST when an entry with the same name and namespace is
+ * present, -ENOATTR otherwise.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_ifork_t		*ifp = args->dp->i_afp;
+	xfs_attr_shortform_t	*sf;
+	xfs_attr_sf_entry_t	*sfe;
+	int			n;
+
+	trace_xfs_attr_sf_lookup(args);
+
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+
+	sfe = &sf->list[0];
+	n = 0;
+	while (n < sf->hdr.count) {
+		if (sfe->namelen == args->namelen &&
+		    memcmp(args->name, sfe->nameval, args->namelen) == 0 &&
+		    xfs_attr_namesp_match(args->flags, sfe->flags))
+			return -EEXIST;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		n++;
+	}
+	return -ENOATTR;
+}
+
+/*
+ * Fetch the value of a name from a shortform attribute list structure.
+ *
+ * Returns -ENOATTR if the name is absent. On a match args->valuelen is set
+ * to the stored value length and the result is:
+ *   -EEXIST with nothing copied when ATTR_KERNOVAL is set;
+ *   -ERANGE when the caller's args->valuelen was smaller than the value;
+ *   -EEXIST after copying the value into args->value otherwise.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t	*sf;
+	xfs_attr_sf_entry_t	*sfe;
+	int			n;
+
+	ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+
+	sfe = &sf->list[0];
+	for (n = 0; n < sf->hdr.count; n++, sfe = XFS_ATTR_SF_NEXTENTRY(sfe)) {
+		if (sfe->namelen != args->namelen ||
+		    memcmp(args->name, sfe->nameval, args->namelen) != 0 ||
+		    !xfs_attr_namesp_match(args->flags, sfe->flags))
+			continue;
+
+		if (!(args->flags & ATTR_KERNOVAL)) {
+			/* Caller wants the data: check it fits first. */
+			if (args->valuelen < sfe->valuelen) {
+				args->valuelen = sfe->valuelen;
+				return -ERANGE;
+			}
+			memcpy(args->value, &sfe->nameval[args->namelen],
+			       sfe->valuelen);
+		}
+		args->valuelen = sfe->valuelen;
+		return -EEXIST;
+	}
+	return -ENOATTR;
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ *
+ * The shortform data is copied to a temporary buffer, the inline fork is
+ * emptied and converted to extents format, a leaf block is created at
+ * block 0, and every shortform entry is re-added to that leaf.
+ *
+ * On failure to grow the inode (other than -EIO) or to create the leaf,
+ * the original shortform contents are put back and the error that caused
+ * the failure is returned. Returns 0 on success.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_da_args_t nargs;
+	char *tmpbuffer;
+	int error, i, size;
+	xfs_dablk_t blkno;
+	struct xfs_buf *bp;
+	xfs_ifork_t *ifp;
+
+	trace_xfs_attr_sf_to_leaf(args);
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	size = be16_to_cpu(sf->hdr.totsize);
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	memcpy(tmpbuffer, ifp->if_u1.if_data, size);
+	sf = (xfs_attr_shortform_t *)tmpbuffer;	/* work from the copy from now on */
+
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK);
+
+	bp = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		/*
+		 * If we hit an IO error middle of the transaction inside
+		 * grow_inode(), we may have inconsistent data. Bail out.
+		 */
+		if (error == -EIO)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	ASSERT(blkno == 0);
+	error = xfs_attr3_leaf_create(args, blkno, &bp);
+	if (error) {
+		/*
+		 * xfs_attr3_leaf_create() may have failed before it
+		 * instantiated a buffer, in which case bp is still NULL and
+		 * there is no block to give back: never hand a NULL buffer to
+		 * xfs_da_shrink_inode(). Keep the creation error in 'error'
+		 * so the caller sees the failure even when the shrink (if
+		 * any) succeeds.
+		 */
+		if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0))
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		memcpy(ifp->if_u1.if_data, tmpbuffer, size);	/* it back */
+		goto out;
+	}
+
+	/* Template args for re-adding each shortform entry to the leaf. */
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.geo = args->geo;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < sf->hdr.count; i++) {
+		nargs.name = sfe->nameval;
+		nargs.namelen = sfe->namelen;
+		nargs.value = &sfe->nameval[nargs.namelen];
+		nargs.valuelen = sfe->valuelen;
+		nargs.hashval = xfs_da_hashname(sfe->nameval,
+						sfe->namelen);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
+		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
+		ASSERT(error == -ENOATTR);
+		error = xfs_attr3_leaf_add(bp, &nargs);
+		ASSERT(error != -ENOSPC);
+		if (error)
+			goto out;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ *
+ * Returns 0 if they would not fit (an entry is remote, or a name or value
+ * is too long for a shortform entry, or xfs_attr_shortform_bytesfit() says
+ * there is no room). Returns -1 if only the shortform header would remain
+ * and the mount is attr2 with a non-btree data fork. Otherwise returns the
+ * fork offset computed by xfs_attr_shortform_bytesfit().
+ */
+int
+xfs_attr_shortform_allfit(
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr_leaf_entry *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	struct xfs_attr3_icleaf_hdr leafhdr;
+	int			bytes;
+	int			i;
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	/* Total up the shortform size of every complete, local entry. */
+	bytes = sizeof(struct xfs_attr_sf_hdr);
+	for (i = 0; i < leafhdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* don't copy partial entries */
+		if (!(entry->flags & XFS_ATTR_LOCAL))
+			return 0;
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return 0;
+		bytes += sizeof(struct xfs_attr_sf_entry) - 1
+				+ name_loc->namelen
+				+ be16_to_cpu(name_loc->valuelen);
+	}
+	if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) &&
+	    (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+	    (bytes == sizeof(struct xfs_attr_sf_hdr)))
+		return -1;
+	return xfs_attr_shortform_bytesfit(dp, bytes);
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ *
+ * The leaf block is copied to a temporary buffer, the on-disk block is
+ * zeroed and released via xfs_da_shrink_inode(), and then either the whole
+ * attr fork is removed (forkoff == -1) or a fresh shortform list is created
+ * and every complete entry from the copy is added to it with the given
+ * forkoff. Returns 0 on success, -ENOMEM if the temporary buffer cannot be
+ * allocated, or the error from xfs_da_shrink_inode().
+ */
+int
+xfs_attr3_leaf_to_shortform(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args,
+	int			forkoff)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_da_args	nargs;
+	struct xfs_inode	*dp = args->dp;
+	char			*tmpbuffer;
+	int			error;
+	int			i;
+
+	trace_xfs_attr_leaf_to_sf(args);
+
+	tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+	if (!tmpbuffer)
+		return -ENOMEM;
+
+	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+
+	/* All further reads of the leaf use the private copy. */
+	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	entry = xfs_attr3_leaf_entryp(leaf);
+
+	/* XXX (dgc): buffer is about to be marked stale - why zero it? */
+	memset(bp->b_addr, 0, args->geo->blksize);
+
+	/*
+	 * Clean out the prior contents of the attribute list.
+	 */
+	error = xfs_da_shrink_inode(args, 0, bp);
+	if (error)
+		goto out;
+
+	if (forkoff == -1) {
+		ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2);
+		ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE);
+		xfs_attr_fork_remove(dp, args->trans);
+		goto out;
+	}
+
+	xfs_attr_shortform_create(args);
+
+	/*
+	 * Copy the attributes
+	 */
+	memset((char *)&nargs, 0, sizeof(nargs));
+	nargs.geo = args->geo;
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.op_flags = XFS_DA_OP_OKNOENT;
+
+	for (i = 0; i < ichdr.count; entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;	/* don't copy partial entries */
+		if (!entry->nameidx)
+			continue;
+		ASSERT(entry->flags & XFS_ATTR_LOCAL);
+		name_loc = xfs_attr3_leaf_name_local(leaf, i);
+		nargs.name = name_loc->nameval;
+		nargs.namelen = name_loc->namelen;
+		nargs.value = &name_loc->nameval[nargs.namelen];
+		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
+		nargs.hashval = be32_to_cpu(entry->hashval);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
+		xfs_attr_shortform_add(&nargs, forkoff);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer);
+	return error;
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ *
+ * A new block is allocated, the existing leaf at block 0 is copied into
+ * it, and block 0 is re-initialised as a node whose single entry points
+ * at the relocated leaf. Returns 0 or the first error encountered.
+ */
+int
+xfs_attr3_leaf_to_node(
+	struct xfs_da_args	*args)
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*blk0_bp = NULL;  /* old leaf, later new root */
+	struct xfs_buf		*moved_bp = NULL; /* leaf's new home */
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr icleafhdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_da_intnode	*node;
+	struct xfs_da3_icnode_hdr icnodehdr;
+	struct xfs_da_node_entry *btree;
+	xfs_dablk_t		new_blkno;
+	int			error;
+
+	trace_xfs_attr_leaf_to_node(args);
+
+	error = xfs_da_grow_inode(args, &new_blkno);
+	if (error)
+		return error;
+
+	error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &blk0_bp);
+	if (error)
+		return error;
+
+	error = xfs_da_get_buf(args->trans, dp, new_blkno, -1, &moved_bp,
+			       XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	/* copy leaf to new buffer, update identifiers */
+	xfs_trans_buf_set_type(args->trans, moved_bp, XFS_BLFT_ATTR_LEAF_BUF);
+	moved_bp->b_ops = blk0_bp->b_ops;
+	memcpy(moved_bp->b_addr, blk0_bp->b_addr, args->geo->blksize);
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = moved_bp->b_addr;
+
+		/* the copied header still names block 0; point it at itself */
+		hdr3->blkno = cpu_to_be64(moved_bp->b_bn);
+	}
+	xfs_trans_log_buf(args->trans, moved_bp, 0, args->geo->blksize - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da3_node_create(args, 0, 1, &blk0_bp, XFS_ATTR_FORK);
+	if (error)
+		return error;
+
+	node = blk0_bp->b_addr;
+	dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+	btree = dp->d_ops->node_tree_p(node);
+
+	leaf = moved_bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
+	/* both on-disk, don't endian-flip twice */
+	btree[0].hashval = entries[icleafhdr.count - 1].hashval;
+	btree[0].before = cpu_to_be32(new_blkno);
+	icnodehdr.count = 1;
+	dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+	xfs_trans_log_buf(args->trans, blk0_bp, 0, args->geo->blksize - 1);
+
+	return 0;
+}
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ *
+ * Gets a buffer for attr-fork block 'blkno', zeroes it, and writes an empty
+ * leaf header whose single free region runs from the end of the header to
+ * the end of the block. On CRC-enabled filesystems the blkno, owner and uuid
+ * identifiers are filled in as well. The whole block is logged.
+ * On success *bpp is set and 0 is returned; on failure the error from
+ * xfs_da_get_buf() is returned and *bpp is not written.
+ */
+STATIC int
+xfs_attr3_leaf_create(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		blkno,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	trace_xfs_attr_leaf_create(args);
+
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_attr3_leaf_buf_ops;
+	xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
+	leaf = bp->b_addr;
+	memset(leaf, 0, args->geo->blksize);
+
+	memset(&ichdr, 0, sizeof(ichdr));
+	ichdr.firstused = args->geo->blksize;	/* empty block: nothing used yet */
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
+
+		ichdr.magic = XFS_ATTR3_LEAF_MAGIC;
+
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+		ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
+	} else {
+		ichdr.magic = XFS_ATTR_LEAF_MAGIC;
+		ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr);
+	}
+	ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
+
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
+
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ *
+ * A new leaf block is allocated and created in newblk, the entries of
+ * oldblk are rebalanced across both, the new block is linked in after the
+ * old one, and state->args is added to whichever block state->inleaf
+ * selects. Errors from growing, creating or linking are returned at once;
+ * an error from the final add is returned after both blocks' hashvals have
+ * been refreshed.
+ */
+int
+xfs_attr3_leaf_split(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*oldblk,
+	struct xfs_da_state_blk	*newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	trace_xfs_attr_leaf_split(state->args);
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return error;
+	error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return error;
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr3_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da3_blk_link(state, oldblk, newblk);
+	if (error)
+		return error;
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attrs info.  Will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 */
+	if (state->inleaf) {
+		trace_xfs_attr_leaf_add_old(state->args);
+		error = xfs_attr3_leaf_add(oldblk->bp, state->args);
+	} else {
+		trace_xfs_attr_leaf_add_new(state->args);
+		error = xfs_attr3_leaf_add(newblk->bp, state->args);
+	}
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return error;
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ *
+ * The freemap is searched (highest index first) for a region large enough
+ * for the new entry; if one is found the entry is added there. Otherwise,
+ * if the block has no holes and the summed free space is too small,
+ * -ENOSPC is returned without touching the block. Failing that the block
+ * is compacted and the add retried against freemap[0], returning -ENOSPC
+ * if that is still too small. Except for the early -ENOSPC return, the
+ * header is written back and logged before returning, and the return value
+ * is that of xfs_attr3_leaf_add_work() or -ENOSPC.
+ */
+int
+xfs_attr3_leaf_add(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	int			tablesize;	/* header + entry table incl. the new entry */
+	int			entsize;	/* bytes needed for the new name/value */
+	int			sum;		/* free bytes seen in rejected freemap slots */
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_add(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	ASSERT(args->index >= 0 && args->index <= ichdr.count);
+	entsize = xfs_attr_leaf_newentsize(args, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) {
+		if (tablesize > ichdr.firstused) {
+			sum += ichdr.freemap[i].size;
+			continue;
+		}
+		if (!ichdr.freemap[i].size)
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (ichdr.freemap[i].base < ichdr.firstused)
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (ichdr.freemap[i].size >= tmp) {
+			tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i);
+			goto out_log_hdr;
+		}
+		sum += ichdr.freemap[i].size;
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!ichdr.holes && sum < entsize)
+		return -ENOSPC;
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr3_leaf_compact(args, &ichdr, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].  If it is not big enough, give up.
+	 */
+	if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
+		tmp = -ENOSPC;
+		goto out_log_hdr;
+	}
+
+	tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
+
+out_log_hdr:
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+				xfs_attr3_leaf_hdr_size(leaf)));
+	return tmp;
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */ +STATIC int +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; + + trace_xfs_attr_leaf_add_work(args); + + leaf = bp->b_addr; + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); + + /* + * Force open some space in the entry array and fill it in. + */ + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; + tmp *= sizeof(xfs_attr_leaf_entry_t); + memmove(entry + 1, entry, tmp); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); + } + ichdr->count++; + + /* + * Allocate space for the new string (at the end of the run). + */ + mp = args->trans->t_mountp; + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); + entry->hashval = cpu_to_be32(args->hashval); + entry->flags = tmp ? 
XFS_ATTR_LOCAL : 0; + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { + entry->flags |= XFS_ATTR_INCOMPLETE; + if ((args->blkno2 == args->blkno) && + (args->index2 <= args->index)) { + args->index2++; + } + } + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + ASSERT((args->index == 0) || + (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); + ASSERT((args->index == ichdr->count - 1) || + (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); + + /* + * For "remote" attribute values, simply note that we need to + * allocate space for the "remote" value. We can't actually + * allocate the extents in this transaction, and we can't decide + * which blocks they should be as we might allocate more blocks + * as part of this transaction (a split operation for example). + */ + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + name_loc->namelen = args->namelen; + name_loc->valuelen = cpu_to_be16(args->valuelen); + memcpy((char *)name_loc->nameval, args->name, args->namelen); + memcpy((char *)&name_loc->nameval[args->namelen], args->value, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->namelen = args->namelen; + memcpy((char *)name_rmt->name, args->name, args->namelen); + entry->flags |= XFS_ATTR_INCOMPLETE; + /* just in case */ + name_rmt->valuelen = 0; + name_rmt->valueblk = 0; + args->rmtblkno = 1; + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; + } + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), + xfs_attr_leaf_entsize(leaf, args->index))); + + /* + * Update the control info for this leaf node + */ + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= 
ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); + } + } + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; +} + +/* + * Garbage collect a leaf attribute list block by copying it to a new buffer. + */ +STATIC void +xfs_attr3_leaf_compact( + struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_dst, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; + struct xfs_trans *trans = args->trans; + char *tmpbuffer; + + trace_xfs_attr_leaf_compact(args); + + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; + + /* + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. + */ + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = args->geo->blksize; + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); + + /* + * Copy all entry's in the same (sorted) order, + * but allocate name/value pairs packed and in sequence. 
+ */ + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); + + kmem_free(tmpbuffer); +} + +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; + + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); +} + +/* + * Redistribute the attribute list entries between two leaf nodes, + * taking into account the size of the new entry. + * + * NOTE: if new block is empty, then it will get the upper half of the + * old block. At present, all (one) callers pass in an empty second block. + * + * This code adjusts the args->index/blkno and args->index2/blkno2 fields + * to match what it is doing in splitting the attribute leaf block. 
Those + * values are used in "atomic rename" operations on attributes. Note that + * the "new" and "old" values can end up in different blocks. + */ +STATIC void +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; + + /* + * Set up environment. + */ + ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); + ASSERT(ichdr2.count == 0); + args = state->args; + + trace_xfs_attr_leaf_rebalance(args); + + /* + * Check ordering of blocks, reverse if it makes things simpler. + * + * NOTE: Given that all (current) callers pass in an empty + * second block, this code should never set "swap". + */ + swap = 0; + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + + tmp_blk = blk1; + blk1 = blk2; + blk2 = tmp_blk; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + swap = 1; + } + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. Then get + * the direction to copy and the number of elements to move. + * + * "inleaf" is true if the new entry should be inserted into blk1. + * If "swap" is also true, then reverse the sense of "inleaf". 
+ */ + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); + if (swap) + state->inleaf = !state->inleaf; + + /* + * Move any entries required from leaf to leaf: + */ + if (count < ichdr1.count) { + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf2 is the destination, compact it if it looks tight. + */ + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); + + /* + * Move high entries from leaf1 to low end of leaf2. + */ + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); + + } else if (count > ichdr1.count) { + /* + * I assert that since all callers pass in an empty + * second buffer, this code should never execute. + */ + ASSERT(0); + + /* + * Figure the total bytes to be added to the destination leaf. + */ + /* number entries being moved */ + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; + space += count * sizeof(xfs_attr_leaf_entry_t); + + /* + * leaf1 is the destination, compact it if it looks tight. + */ + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); + + /* + * Move low entries from leaf2 to high end of leaf1. 
+ */ + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); + } + + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); + + /* + * Copy out last hashval in each block for B-tree code. + */ + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); + + /* + * Adjust the expected index for insertion. + * NOTE: this code depends on the (current) situation that the + * second block was originally empty. + * + * If the insertion point moved to the 2nd block, we must adjust + * the index. We must also track the entry just following the + * new entry for use in an "atomic rename" operation, that entry + * is always the "old" entry and the "new" entry is what we are + * inserting. The index/blkno fields refer to the "old" entry, + * while the index2/blkno2 fields refer to the "new" entry. + */ + if (blk1->index > ichdr1.count) { + ASSERT(state->inleaf == 0); + blk2->index = blk1->index - ichdr1.count; + args->index = args->index2 = blk2->index; + args->blkno = args->blkno2 = blk2->blkno; + } else if (blk1->index == ichdr1.count) { + if (state->inleaf) { + args->index = blk1->index; + args->blkno = blk1->blkno; + args->index2 = 0; + args->blkno2 = blk2->blkno; + } else { + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. 
+ */ + blk2->index = blk1->index - ichdr1.count; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } + } + } else { + ASSERT(state->inleaf == 1); + args->index = args->index2 = blk1->index; + args->blkno = args->blkno2 = blk1->blkno; + } +} + +/* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + * GROT: Is this really necessary? With other than a 512 byte blocksize, + * GROT: there will always be enough room in either block for a new entry. + * GROT: Do a double-split for this case? + */ +STATIC int +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) +{ + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; + + /* + * Examine entries until we reduce the absolute difference in + * byte usage between the two blocks to a minimum. + */ + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args, NULL); + half /= 2; + lastdelta = state->args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf1); + for (count = index = 0; count < max; entry++, index++, count++) { + +#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) + /* + * The new entry is in the first block, account for it. 
+ */ + if (count == blk1->index) { + tmp = totallen + sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, NULL); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; + foundit = 1; + } + + /* + * Wrap around into the second block if necessary. + */ + if (count == ichdr1->count) { + leaf1 = leaf2; + entry = xfs_attr3_leaf_entryp(leaf1); + index = 0; + } + + /* + * Figure out if next leaf entry would be too much. + */ + tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1, + index); + if (XFS_ATTR_ABS(half - tmp) > lastdelta) + break; + lastdelta = XFS_ATTR_ABS(half - tmp); + totallen = tmp; +#undef XFS_ATTR_ABS + } + + /* + * Calculate the number of usedbytes that will end up in lower block. + * If new entry not in lower block, fix up the count. + */ + totallen -= count * sizeof(*entry); + if (foundit) { + totallen -= sizeof(*entry) + + xfs_attr_leaf_newentsize(state->args, NULL); + } + + *countarg = count; + *usedbytesarg = totallen; + return foundit; +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + * + * GROT: allow for INCOMPLETE entries in calculation. 
+ */
+int
+xfs_attr3_leaf_toosmall(
+	struct xfs_da_state	*state,
+	int			*action)
+{
+	struct xfs_da_args	*args = state->args;
+	struct xfs_da_state_blk	*cur;
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr hdr;
+	struct xfs_buf		*sib_bp;
+	xfs_dablk_t		sib_blkno;
+	int			room;
+	int			forward;
+	int			shifted;
+	int			rc;
+	int			pass;
+
+	trace_xfs_attr_leaf_toosmall(args);
+
+	/*
+	 * A block that is more than half full is left alone; there is no
+	 * point in even looking at its siblings.
+	 */
+	cur = &state->path.blk[state->path.active - 1];
+	leaf = cur->bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &hdr, leaf);
+	room = xfs_attr3_leaf_hdr_size(leaf) +
+		hdr.count * sizeof(xfs_attr_leaf_entry_t) +
+		hdr.usedbytes;
+	if (room > (args->geo->blksize >> 1)) {
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * An empty block is simply deleted rather than merged.  altpath is
+	 * pointed at the block to keep (the forward sibling unless there is
+	 * none) while path keeps pointing at this block, the one to drop.
+	 */
+	if (hdr.count == 0) {
+		forward = (hdr.forw != 0);
+		memcpy(&state->altpath, &state->path, sizeof(state->path));
+		rc = xfs_da3_path_shift(state, &state->altpath, forward,
+					0, &shifted);
+		if (rc)
+			return rc;
+		*action = shifted ? 0 : 2;
+		return 0;
+	}
+
+	/*
+	 * Try both siblings, the lower numbered one first so the attribute
+	 * list shrinks over time, and stop at the first one that could
+	 * absorb this block with at least 25% of a block left free.
+	 */
+	forward = hdr.forw < hdr.back;
+	for (pass = 0; pass < 2; pass++, forward = !forward) {
+		struct xfs_attr3_icleaf_hdr sib_hdr;
+
+		sib_blkno = forward ? hdr.forw : hdr.back;
+		if (sib_blkno == 0)
+			continue;
+		rc = xfs_attr3_leaf_read(args->trans, args->dp,
+					 sib_blkno, -1, &sib_bp);
+		if (rc)
+			return rc;
+
+		xfs_attr3_leaf_hdr_from_disk(args->geo, &sib_hdr,
+					     sib_bp->b_addr);
+
+		room = args->geo->blksize -
+			(args->geo->blksize >> 2) -
+			hdr.usedbytes - sib_hdr.usedbytes -
+			((hdr.count + sib_hdr.count) *
+					sizeof(xfs_attr_leaf_entry_t)) -
+			xfs_attr3_leaf_hdr_size(leaf);
+
+		xfs_trans_brelse(args->trans, sib_bp);
+		if (room >= 0)
+			break;
+	}
+	if (pass >= 2) {
+		/* neither sibling can take this block */
+		*action = 0;
+		return 0;
+	}
+
+	/*
+	 * altpath ends up on the block to keep (the lower numbered one) and
+	 * path on the block to drop, so shift whichever of the two has to
+	 * move onto the sibling.
+	 */
+	memcpy(&state->altpath, &state->path, sizeof(state->path));
+	rc = xfs_da3_path_shift(state,
+			(sib_blkno < cur->blkno) ? &state->altpath :
+						   &state->path,
+			forward, 0, &shifted);
+	if (rc)
+		return rc;
+	*action = shifted ? 0 : 1;
+	return 0;
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ *
+ * bp holds the leaf block and args->index selects the entry to remove.
+ * The entry's name/value bytes are zeroed and handed back to the freemap,
+ * the entry table is closed up, and the updated header is written back.
+ * Every modified range is logged against args->trans.
+ */
+int
+xfs_attr3_leaf_remove(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	int			before;
+	int			after;
+	int			smallest;
+	int			entsize;
+	int			tablesize;
+	int			tmp;
+	int			i;
+
+	trace_xfs_attr_leaf_remove(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+
+	ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
+	ASSERT(args->index >= 0 && args->index < ichdr.count);
+	ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
+					xfs_attr3_leaf_hdr_size(leaf));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
+
+	ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+	ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf);
+	tmp = ichdr.freemap[0].size;	/* smallest region size seen so far */
+	before = after = -1;		/* -1: no adjacent free region found */
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize = xfs_attr_leaf_entsize(leaf, args->index);
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
+		ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+		ASSERT(ichdr.freemap[i].size < args->geo->blksize);
+		if (ichdr.freemap[i].base == tablesize) {
+			/* the table shrinks by one entry; grow this region */
+			ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
+			ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
+		}
+
+		if (ichdr.freemap[i].base + ichdr.freemap[i].size ==
+				be16_to_cpu(entry->nameidx)) {
+			before = i;	/* region ends where the name begins */
+		} else if (ichdr.freemap[i].base ==
+				(be16_to_cpu(entry->nameidx) + entsize)) {
+			after = i;	/* region begins where the name ends */
+		} else if (ichdr.freemap[i].size < tmp) {
+			tmp = ichdr.freemap[i].size;
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			/* merge both neighbours and the freed bytes into one */
+			ichdr.freemap[before].size += entsize;
+			ichdr.freemap[before].size += ichdr.freemap[after].size;
+			ichdr.freemap[after].base = 0;
+			ichdr.freemap[after].size = 0;
+		} else if (before >= 0) {
+			ichdr.freemap[before].size += entsize;
+		} else {
+			ichdr.freemap[after].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[after].size += entsize;
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		if (ichdr.freemap[smallest].size < entsize) {
+			ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx);
+			ichdr.freemap[smallest].size = entsize;
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 * ("smallest" is reused from here on as a flag: non-zero means the
+	 * removed name/value started at firstused.)
+	 */
+	if (be16_to_cpu(entry->nameidx) == ichdr.firstused)
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize);
+	ichdr.usedbytes -= entsize;
+	xfs_trans_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
+				   entsize));
+
+	tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t);
+	memmove(entry, entry + 1, tmp);	/* slide later entries down a slot */
+	ichdr.count--;
+	xfs_trans_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t)));
+
+	entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count];
+	memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* clear vacated slot */
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = args->geo->blksize;
+		entry = xfs_attr3_leaf_entryp(leaf);
+		for (i = ichdr.count - 1; i >= 0; entry++, i--) {
+			ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
+			ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
+
+			if (be16_to_cpu(entry->nameidx) < tmp)
+				tmp = be16_to_cpu(entry->nameidx);
+		}
+		ichdr.firstused = tmp;
+		ASSERT(ichdr.firstused != 0);
+	} else {
+		ichdr.holes = 1;	/* mark as needing compaction */
+	}
+	xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
+	xfs_trans_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, &leaf->hdr,
+					  xfs_attr3_leaf_hdr_size(leaf)));
+
+	/*
+	 * Check if the leaf has dropped below the geometry's "magicpct"
+	 * threshold (the 37% mark referred to above), caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
+	      ichdr.count * sizeof(xfs_attr_leaf_entry_t);
+
+	return tmp < args->geo->magicpct; /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ *
+ * The last hashval of the dying block is saved in drop_blk->hashval first.
+ * If save_leaf has no holes the entries are moved into it in place, placed
+ * before or after the existing entries according to
+ * xfs_attr3_leaf_order().  Otherwise both leaves are moved, in order, into
+ * a zeroed temporary block that is then copied over save_leaf.
+ *
+ * Finally the save_leaf header is written back, the whole save buffer is
+ * logged and save_blk->hashval is set to the last entry's hashval.
+ */
+void
+xfs_attr3_leaf_unbalance(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_blk	*drop_blk,
+	struct xfs_da_state_blk	*save_blk)
+{
+	struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr;
+	struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr;
+	struct xfs_attr3_icleaf_hdr drophdr;
+	struct xfs_attr3_icleaf_hdr savehdr;
+	struct xfs_attr_leaf_entry *entry;
+
+	trace_xfs_attr_leaf_unbalance(state->args);
+
+	drop_leaf = drop_blk->bp->b_addr; /* same value as the initialiser */
+	save_leaf = save_blk->bp->b_addr; /* same value as the initialiser */
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
+	xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
+	entry = xfs_attr3_leaf_entryp(drop_leaf);
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it, toosmall() decided that for us already.
+	 */
+	if (savehdr.holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			/* dropped entries sort first: insert at the front */
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr, 0,
+						drophdr.count);
+		} else {
+			/* dropped entries sort last: append at the end */
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						save_leaf, &savehdr,
+						savehdr.count, drophdr.count);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		struct xfs_attr_leafblock *tmp_leaf;
+		struct xfs_attr3_icleaf_hdr tmphdr;
+
+		tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
+
+		/*
+		 * Copy the header into the temp leaf so that all the stuff
+		 * not in the incore header is present and gets copied back in
+		 * once we've moved all the entries.
+		 */
+		memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf));
+
+		memset(&tmphdr, 0, sizeof(tmphdr));
+		tmphdr.magic = savehdr.magic;
+		tmphdr.forw = savehdr.forw;
+		tmphdr.back = savehdr.back;
+		tmphdr.firstused = state->args->geo->blksize;
+
+		/* write the header to the temp buffer to initialise it */
+		xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
+
+		if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
+					 drop_blk->bp, &drophdr)) {
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						drophdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						savehdr.count);
+		} else {
+			xfs_attr3_leaf_moveents(state->args,
+						save_leaf, &savehdr, 0,
+						tmp_leaf, &tmphdr, 0,
+						savehdr.count);
+			xfs_attr3_leaf_moveents(state->args,
+						drop_leaf, &drophdr, 0,
+						tmp_leaf, &tmphdr, tmphdr.count,
+						drophdr.count);
+		}
+		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
+		savehdr = tmphdr; /* struct copy */
+		kmem_free(tmp_leaf);
+	}
+
+	xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
+	xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->args->geo->blksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	entry = xfs_attr3_leaf_entryp(save_leaf);
+	save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ *
+ * Returns -EEXIST when an entry matches args->hashval, the INCOMPLETE
+ * state asked for in args->flags, the name and the namespace, and
+ * -ENOATTR otherwise.  For a matching entry that is not flagged
+ * XFS_ATTR_LOCAL, args->rmtvaluelen, args->rmtblkno and args->rmtblkcnt
+ * are filled in from the entry as well.
+ */
+int
+xfs_attr3_leaf_lookup_int(
+	struct xfs_buf		*bp,
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf;
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entry;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_attr_leaf_name_local *name_loc;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	xfs_dahash_t		hashval;
+	int			probe;
+	int			span;
+
+	trace_xfs_attr_leaf_lookup(args);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+	ASSERT(ichdr.count < args->geo->blksize / 8);
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = ichdr.count / 2;
+	for (entry = &entries[probe]; span > 4; entry = &entries[probe]) {
+		span /= 2;
+		if (be32_to_cpu(entry->hashval) < hashval)
+			probe += span;
+		else if (be32_to_cpu(entry->hashval) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count));
+	ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval);
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) {
+		entry--;
+		probe--;
+	}
+	while (probe < ichdr.count &&
+	       be32_to_cpu(entry->hashval) < hashval) {
+		entry++;
+		probe++;
+	}
+	if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
+		args->index = probe;	/* where the entry would be inserted */
+		return -ENOATTR;
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval);
+			entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+		/*
+		 * If we are looking for INCOMPLETE entries, show only those.
+		 * If we are looking for complete entries, show only those.
+		 */
+		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+			continue;
+		}
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = xfs_attr3_leaf_name_local(leaf, probe);
+			if (name_loc->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_loc->nameval,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			return -EEXIST;
+		} else {
+			name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
+			if (name_rmt->namelen != args->namelen)
+				continue;
+			if (memcmp(args->name, name_rmt->name,
+							args->namelen) != 0)
+				continue;
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
+				continue;
+			args->index = probe;
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
+			args->rmtblkcnt = xfs_attr3_rmt_blocks(
+							args->dp->i_mount,
+							args->rmtvaluelen);
+			return -EEXIST;
+		}
+	}
+	args->index = probe;	/* first entry past the run of equal hashvals */
+	return -ENOATTR;
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */ +int +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); + ASSERT(args->index < ichdr.count); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + ASSERT(name_loc->namelen == args->namelen); + ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); + valuelen = be16_to_cpu(name_loc->valuelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = valuelen; + return 0; + } + if (args->valuelen < valuelen) { + args->valuelen = valuelen; + return -ERANGE; + } + args->valuelen = valuelen; + memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + ASSERT(name_rmt->namelen == args->namelen); + ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); + args->rmtblkno = be32_to_cpu(name_rmt->valueblk); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); + if (args->flags & ATTR_KERNOVAL) { + args->valuelen = args->rmtvaluelen; + return 0; + } + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; + return -ERANGE; + } + args->valuelen = args->rmtvaluelen; + } + return 0; +} + +/*======================================================================== + * Utility routines. + *========================================================================*/ + +/* + * Move the indicated entries from one leaf to another. + * NOTE: this routine modifies both source and destination leaves. 
+ *
+ * args:		supplies args->geo->blksize for the bounds assertions
+ * leaf_s/ichdr_s:	source leaf block and its in-core header
+ * start_s:		index of the first source entry to move
+ * leaf_d/ichdr_d:	destination leaf block and its in-core header
+ * start_d:		destination index at which the entries are inserted
+ * count:		number of entries to move (0 is a no-op)
+ *
+ * Only the leaf contents and the two in-core headers are updated; nothing in
+ * this function writes the headers back to the blocks or logs the buffers.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr3_leaf_moveents(
+	struct xfs_da_args		*args,
+	struct xfs_attr_leafblock	*leaf_s,
+	struct xfs_attr3_icleaf_hdr	*ichdr_s,
+	int				start_s,
+	struct xfs_attr_leafblock	*leaf_d,
+	struct xfs_attr3_icleaf_hdr	*ichdr_d,
+	int				start_d,
+	int				count)
+{
+	struct xfs_attr_leaf_entry	*entry_s;
+	struct xfs_attr_leaf_entry	*entry_d;
+	int				desti;
+	int				tmp;
+	int				i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
+	       ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
+	ASSERT(ichdr_s->magic == ichdr_d->magic);
+	ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
+	ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+					+ xfs_attr3_leaf_hdr_size(leaf_s));
+	ASSERT(ichdr_d->count < args->geo->blksize / 8);
+	ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+					+ xfs_attr3_leaf_hdr_size(leaf_d));
+
+	ASSERT(start_s < ichdr_s->count);
+	ASSERT(start_d <= ichdr_d->count);
+	ASSERT(count <= ichdr_s->count);
+
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 * (Only needed when inserting before existing destination entries;
+	 * the ranges overlap, hence memmove.)
+	 */
+	if (start_d < ichdr_d->count) {
+		tmp  = ichdr_d->count - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count];
+		memmove(entry_d, entry_s, tmp);
+	}
+
+	/*
+	 * Copy all entries in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 */
+	entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+	entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused);
+		/* tmp = bytes of name/value data this entry owns in leaf_s */
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.  Code turned
+		 * off for 6.2, should be revisited later.
+		 *
+		 * NOTE(review): this dead branch uses `offset' and `result',
+		 * neither of which is declared in this function, so it would
+		 * not build if GROT were ever defined.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_s->count -= 1;
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			/* Carve room for the data just below firstused. */
+			ichdr_d->firstused -= tmp;
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
+			entry_d->flags = entry_s->flags;
+			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
+							<= args->geo->blksize);
+			memmove(xfs_attr3_leaf_name(leaf_d, desti),
+				xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
+			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
+							<= args->geo->blksize);
+			/* Scrub the data area the entry used in the source. */
+			memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
+			ichdr_s->usedbytes -= tmp;
+			ichdr_d->usedbytes += tmp;
+			ichdr_s->count -= 1;
+			ichdr_d->count += 1;
+			tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t)
+					+ xfs_attr3_leaf_hdr_size(leaf_d);
+			ASSERT(ichdr_d->firstused >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 * ichdr_s->count has already been reduced by the loop above, so
+	 * start_s == count means the moved entries were the tail of the table.
+	 */
+	if (start_s == ichdr_s->count) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count];
+		entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
+		memmove(entry_d, entry_s, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + args->geo->blksize));
+		memset(entry_s, 0, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information: the destination gets a single
+	 * free region running from the end of its entry table up to
+	 * firstused; the other two freemap slots are cleared.
+	 */
+	ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d);
+	ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t);
+	ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base;
+	ichdr_d->freemap[1].base = 0;
+	ichdr_d->freemap[2].base = 0;
+	ichdr_d->freemap[1].size = 0;
+	ichdr_d->freemap[2].size = 0;
+	ichdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ *
+ * If count is non-NULL the number of entries in the leaf is stored through
+ * it.  Returns 0 for a leaf with no entries.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(
+	struct xfs_buf	*bp,
+	int		*count)
+{
+	struct xfs_attr3_icleaf_hdr ichdr;
+	struct xfs_attr_leaf_entry *entries;
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
+	entries = xfs_attr3_leaf_entryp(bp->b_addr);
+	if (count)
+		*count = ichdr.count;
+	if (!ichdr.count)
+		return 0;
+	return be32_to_cpu(entries[ichdr.count - 1].hashval);
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */ +STATIC int +xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) +{ + struct xfs_attr_leaf_entry *entries; + xfs_attr_leaf_name_local_t *name_loc; + xfs_attr_leaf_name_remote_t *name_rmt; + int size; + + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); + size = xfs_attr_leaf_entsize_local(name_loc->namelen, + be16_to_cpu(name_loc->valuelen)); + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); + size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); + } + return size; +} + +/* + * Calculate the number of bytes that would be required to store the new + * attribute (whether local or remote only calculate bytes in this block). + * This routine decides as a side effect whether the attribute will be + * a "local" or a "remote" attribute. + */ +int +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) +{ + int size; + + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) + *local = 1; + return size; + } + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); +} + + +/*======================================================================== + * Manage the INCOMPLETE flag in a leaf entry + *========================================================================*/ + +/* + * Clear the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; + xfs_attr_leaf_name_local_t *name_loc; + int namelen; + char *name; +#endif /* DEBUG */ + + trace_xfs_attr_leaf_clearflag(args); + /* + * Set up the operation. 
+ */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + leaf = bp->b_addr; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); + +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + + if (entry->flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); + namelen = name_loc->namelen; + name = (char *)name_loc->nameval; + } else { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + namelen = name_rmt->namelen; + name = (char *)name_rmt->name; + } + ASSERT(be32_to_cpu(entry->hashval) == args->hashval); + ASSERT(namelen == args->namelen); + ASSERT(memcmp(name, args->name, namelen) == 0); +#endif /* DEBUG */ + + entry->flags &= ~XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + + if (args->rmtblkno) { + ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = cpu_to_be32(args->rmtblkno); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * Set the INCOMPLETE flag on an entry in a leaf block. + */ +int +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif + + trace_xfs_attr_leaf_setflag(args); + + /* + * Set up the operation. 
+ */ + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; + + leaf = bp->b_addr; +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); + entry->flags |= XFS_ATTR_INCOMPLETE; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); + if ((entry->flags & XFS_ATTR_LOCAL) == 0) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); + name_rmt->valueblk = 0; + name_rmt->valuelen = 0; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); + } + + /* + * Commit the flag value change and start the next trans in series. + */ + return xfs_trans_roll(&args->trans, args->dp); +} + +/* + * In a single transaction, clear the INCOMPLETE flag on the leaf entry + * given by args->blkno/index and set the INCOMPLETE flag on the leaf + * entry given by args->blkno2/index2. + * + * Note that they could be in different blocks, or in the same block. 
+ *
+ * The "old" entry (blkno/index) loses INCOMPLETE and, when args->rmtblkno is
+ * non-zero, has the remote value block/length from args written into it.
+ * The "new" entry (blkno2/index2) gains INCOMPLETE and, when it is not a
+ * local entry, has its remote value block/length zeroed.
+ *
+ * Returns the error from reading either block, else the result of
+ * xfs_trans_roll().
+ */
+int
+xfs_attr3_leaf_flipflags(
+	struct xfs_da_args	*args)
+{
+	struct xfs_attr_leafblock *leaf1;
+	struct xfs_attr_leafblock *leaf2;
+	struct xfs_attr_leaf_entry *entry1;
+	struct xfs_attr_leaf_entry *entry2;
+	struct xfs_attr_leaf_name_remote *name_rmt;
+	struct xfs_buf		*bp1;
+	struct xfs_buf		*bp2;
+	int error;
+#ifdef DEBUG
+	struct xfs_attr3_icleaf_hdr ichdr1;
+	struct xfs_attr3_icleaf_hdr ichdr2;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	trace_xfs_attr_leaf_flipflags(args);
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	if (error)
+		return error;
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 *
+	 * NOTE(review): bp1 is not released here when this second read
+	 * fails; presumably the caller's transaction teardown takes care
+	 * of it - confirm.
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2,
+					   -1, &bp2);
+		if (error)
+			return error;
+	} else {
+		bp2 = bp1;
+	}
+
+	leaf1 = bp1->b_addr;
+	entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index];
+
+	leaf2 = bp2->b_addr;
+	entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
+
+#ifdef DEBUG
+	/* Both indices must be in range and both entries must carry the
+	 * same hash and the same name. */
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
+	ASSERT(args->index < ichdr1.count);
+	ASSERT(args->index >= 0);
+
+	xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
+	ASSERT(args->index2 < ichdr2.count);
+	ASSERT(args->index2 >= 0);
+
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(be32_to_cpu(entry1->hashval) == be32_to_cpu(entry2->hashval));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(memcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	/* Old entry: clear INCOMPLETE and log the entry. */
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		/* Record where the remote value lives, and log that too. */
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
+		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
+		xfs_trans_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/* New entry: set INCOMPLETE and log the entry. */
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_trans_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		/* Remote entry: wipe its value block/length as well. */
+		name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2);
+		name_rmt->valueblk = 0;
+		name_rmt->valuelen = 0;
+		xfs_trans_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_trans_roll(&args->trans, args->dp);
+
+	return error;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_attr_leaf.h b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h
new file mode 100644
index 000000000..882c8d338
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+/*
+ * Forward declarations: the prototypes in this header only take pointers
+ * to these structures, so their full definitions are not needed here.
+ */
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_attr_list_context;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ * Each element names one remote value by its starting block and byte length.
+ */
+typedef struct xfs_attr_inactive_list {
+	xfs_dablk_t	valueblk;	/* block number of value bytes */
+	int		valuelen;	/* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute fork size < XFS_LITINO(mp),
+ * i.e. the "shortform" attribute list.
+ */
+void	xfs_attr_shortform_create(struct xfs_da_args *args);
+void	xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
+int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_remove(struct xfs_da_args *args);
+int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
+int	xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
+void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
+
+/*
+ * Internal routines when attribute fork size == XFS_LBSIZE(mp).
+ */ +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, + struct xfs_da_args *args, int forkoff); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); + +/* + * Routines used for growing the Btree. + */ +int xfs_attr3_leaf_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk); +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, + struct xfs_da_args *args); +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, + struct xfs_attr_list_context *context); + +/* + * Routines used for shrinking the Btree. + */ +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); + +/* + * Utility routines. 
+ */ +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from); + +#endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.c b/kernel/fs/xfs/libxfs/xfs_attr_remote.c new file mode 100644 index 000000000..20de88d1b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. 
+ */ +static bool +xfs_attr3_rmt_hdr_ok( + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + xfs_buf_ioerror(bp, -EFSBADCRC); + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + break; + } + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + + if (bp->b_error) + xfs_verifier_error(bp); + else + ASSERT(len == 0); +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item 
*bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); + + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + ASSERT(len == 0); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +STATIC int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bno); + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Helper functions to copy attribute data in and out of the one disk extents + */ +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **dst) +{ + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + + if 
(xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return -EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= blksize; + src += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < blksize) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == blksize); + memset(dst + hdr_size + byte_cnt, 0, + blksize - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. 
+ *
+ * args->rmtblkno/args->rmtblkcnt give the range of the attribute fork that
+ * holds the value and args->rmtvaluelen its length; the bytes are copied to
+ * args->value.  Returns 0, or the first error from mapping, reading or
+ * unpacking a block.
+ */
+int
+xfs_attr_rmtval_get(
+	struct xfs_da_args	*args)
+{
+	struct xfs_bmbt_irec	map[ATTR_RMTVALUE_MAPSIZE];
+	struct xfs_mount	*mp = args->dp->i_mount;
+	struct xfs_buf		*bp;
+	xfs_dablk_t		lblkno = args->rmtblkno;
+	__uint8_t		*dst = args->value;
+	int			valuelen;
+	int			nmap;
+	int			error;
+	int			blkcnt = args->rmtblkcnt;
+	int			i;
+	int			offset = 0;
+
+	trace_xfs_attr_rmtval_get(args);
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
+
+	valuelen = args->rmtvaluelen;
+	while (valuelen > 0) {
+		/* Map the next piece of the value's range in the attr fork. */
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
+				       blkcnt, map, &nmap,
+				       XFS_BMAPI_ATTRFORK);
+		if (error)
+			return error;
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			xfs_daddr_t	dblkno;
+			int		dblkcnt;
+
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			/* Read the whole extent with the remote-attr buffer ops. */
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+						   dblkno, dblkcnt, 0, &bp,
+						   &xfs_attr3_rmt_buf_ops);
+			if (error)
+				return error;
+
+			/* copyout advances offset, valuelen and dst for us */
+			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+							&offset, &valuelen,
+							&dst);
+			/* the buffer is released whether or not copyout worked */
+			xfs_buf_relse(bp);
+			if (error)
+				return error;
+
+			/* roll attribute extent map forwards */
+			lblkno += map[i].br_blockcount;
+			blkcnt -= map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return 0;
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + __uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion and have to take the header space into account. + */ + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. 
+ */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return error; + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); + + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + if (!bp) + return -ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + + /* roll attribute extent map forwards */ + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; + + trace_xfs_attr_rmtval_remove(args); + + /* + * Roll through the "value", invalidating the attribute value's blocks. 
+ */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + int committed; + + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. 
+ */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return error; + } + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_attr_remote.h b/kernel/fs/xfs/libxfs/xfs_attr_remote.h new file mode 100644 index 000000000..5a9acfa15 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_remote.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ + +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); + +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_attr_sf.h b/kernel/fs/xfs/libxfs/xfs_attr_sf.h new file mode 100644 index 000000000..919756e3b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_attr_sf.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define	__XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+/*
+ * Entries are packed toward the top as tightly as possible.
+ */
+typedef struct xfs_attr_shortform {
+	struct xfs_attr_sf_hdr {	/* constant-structure header block */
+		__be16	totsize;	/* total bytes in shortform list (big-endian) */
+		__u8	count;		/* count of active entries */
+	} hdr;
+	struct xfs_attr_sf_entry {
+		__uint8_t namelen;	/* actual length of name (no NUL terminator) */
+		__uint8_t valuelen;	/* actual length of value (no NUL terminator) */
+		__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+		__uint8_t nameval[1];	/* name & value bytes concatenated; [1] is a placeholder length */
+	} list[1];			/* variable sized array; declared length 1 is a placeholder */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+	__uint8_t	entno;		/* entry number in original list */
+	__uint8_t	namelen;	/* length of name value (no NUL terminator) */
+	__uint8_t	valuelen;	/* length of value */
+	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	unsigned char	*name;		/* name value, pointer into buffer (not a copy) */
+} xfs_attr_sf_sort_t;
+
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* bytes for one entry; the -1 drops the nameval[1] placeholder */ \
+	(((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen)))
+#define XFS_ATTR_SF_ENTSIZE_MAX			/* largest value a __uint8_t length field can hold */ \
+	((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#define XFS_ATTR_SF_ENTSIZE(sfep)		/* bytes used by the existing entry sfep */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#define XFS_ATTR_SF_NEXTENTRY(sfep)		/* entry that starts XFS_ATTR_SF_ENTSIZE(sfep) bytes after sfep */ \
+	((xfs_attr_sf_entry_t *)((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#define XFS_ATTR_SF_TOTSIZE(dp)			/* hdr.totsize of dp's attr fork, in CPU byte order */ \
+	(be16_to_cpu(((xfs_attr_shortform_t *)	\
+		((dp)->i_afp->if_u1.if_data))->hdr.totsize))
+
+#endif /* __XFS_ATTR_SF_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bit.h b/kernel/fs/xfs/libxfs/xfs_bit.h
new file mode 100644
index 000000000..e1649c0d3
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bit.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BIT_H__
+#define	__XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * Masks with n high/low bits set: 64-bit hi/lo variants plus a 32-bit lo variant.
+ */
+static inline __uint64_t xfs_mask64hi(int n)
+{
+	/* Caller must keep n in 1..64; n == 0 would shift by the full width. */
+	return (__uint64_t)-1 << (64 - (n));
+}
+static inline __uint32_t xfs_mask32lo(int n)
+{
+	/* Caller must keep n in 0..31; a shift by 32 is not defined for a 32-bit type. */
+	return ((__uint32_t)1 << (n)) - 1;
+}
+static inline __uint64_t xfs_mask64lo(int n)
+{
+	/* Caller must keep n in 0..63; a shift by 64 is not defined for a 64-bit type. */
+	return ((__uint64_t)1 << (n)) - 1;
+}
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_highbit32(__uint32_t v)
+{
+	return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+	return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+	return ffs(v) - 1;
+}
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_lowbit64(__uint64_t v)
+{
+	__uint32_t	w = (__uint32_t)v;	/* low half first */
+	int		n = 0;			/* stays 0 when v == 0, giving -1 below */
+
+	if (w) {	/* lower bits */
+		n = ffs(w);
+	} else {	/* upper bits */
+		w = (__uint32_t)(v >> 32);
+		if (w) {
+			n = ffs(w);
+			if (n)	/* w is nonzero here, so this is a defensive check */
+				n += 32;
+		}
+	}
+	return n - 1;
+}
+
+/* Return whether bitmap is empty (1 == empty) */
+extern int xfs_bitmap_empty(uint *map, uint size);
+
+/* Count contiguous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif /* __XFS_BIT_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.c b/kernel/fs/xfs/libxfs/xfs_bmap.c
new file mode 100644
index 000000000..f1026e86d
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap.c
@@ -0,0 +1,5945 @@
+/*
+ * Copyright (c) 2000-2006 Silicon
Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_filestream.h"
+
+
+kmem_zone_t		*xfs_bmap_free_item_zone;	/* zone used by xfs_bmap_add_free() and xfs_bmap_del_free() */
+
+/*
+ * Miscellaneous helper functions
+ */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem. Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
+
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 *
+	 * Note that we can no longer assume that if we are in ATTR1 that
+	 * the fork offset of all the inodes will be
+	 * (xfs_default_attroffset(ip) >> 3) because we could have mounted
+	 * with ATTR2 and then mounted back with ATTR1, keeping the
+	 * di_forkoff's fixed but probably at various positions. Therefore,
+	 * for both ATTR1 and ATTR2 we have to assume the worst case scenario
+	 * of a minimum size available.
+	 */
+	if (whichfork == XFS_DATA_FORK) {
+		maxleafents = MAXEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
+	} else {
+		maxleafents = MAXAEXTNUM;
+		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	}
+	maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;	/* leaf blocks, rounded up */
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;	/* everything fits under the root: last iteration */
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;	/* rounded up */
+	}
+	mp->m_bm_maxlevels[whichfork] = level;	/* result is cached per fork in the mount */
+}
+/* Load [off, bno, len] into the cursor record and run an XFS_LOOKUP_EQ lookup; *stat is filled in by xfs_btree_lookup(). */
+STATIC int				/* error */
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+/* Same as xfs_bmbt_lookup_eq() but with an XFS_LOOKUP_GE lookup. */
+STATIC int				/* error */
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Check if the inode needs to be converted to btree format: the fork is
+ * in extents format and holds more extents than XFS_IFORK_MAXEXT() allows.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) >
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format: the fork is
+ * in btree format but its extent count is within XFS_IFORK_MAXEXT().
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		XFS_IFORK_NEXTENTS(ip, whichfork) <=
+			XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	union xfs_btree_rec	rec;	/* on-disk form of the new record */
+
+	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+	return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;	/* with do_div() below: len = ceil(len / maxrecs) */
+		do_div(len, maxrecs);
+		rval += len;		/* blocks needed at this level */
+		if (len == 1)		/* one block here: add one block per remaining level */
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)		/* presumably the node-block limit above the leaf level -- confirm */
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+/*
+ * Calculate the default attribute fork offset for newly created inodes.
+ */
+uint
+xfs_default_attroffset(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			offset;
+
+	if (mp->m_sb.sb_inodesize == 256) {
+		/* leaves XFS_BMDR_SPACE_CALC(MINABTPTRS) bytes between offset and XFS_LITINO() */
+		offset = XFS_LITINO(mp, ip->i_d.di_version) -
+				XFS_BMDR_SPACE_CALC(MINABTPTRS);
+	} else {
+		offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
+	}
+
+	ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+	return offset;
+}
+
+/*
+ * Helper routine to reset inode di_forkoff field when switching
+ * attribute fork from local to extent format - we reset it where
+ * possible to make space available for inline data fork extents.
+ */
+STATIC void
+xfs_bmap_forkoff_reset(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	if (whichfork == XFS_ATTR_FORK &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_DEV &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_UUID &&
+	    ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
+		uint	dfl_forkoff = xfs_default_attroffset(ip) >> 3;
+
+		if (dfl_forkoff > ip->i_d.di_forkoff)	/* only ever moves di_forkoff up */
+			ip->i_d.di_forkoff = dfl_forkoff;
+	}
+}
+
+#ifdef DEBUG	/* the checkers from here to the matching #else exist only in DEBUG builds */
+STATIC struct xfs_buf *
+xfs_bmap_get_bp(
+	struct xfs_btree_cur	*cur,	/* cursor to search; NULL yields NULL */
+	xfs_fsblock_t		bno)	/* compared with XFS_BUF_ADDR(); callers in this file pass XFS_FSB_TO_DADDR() values */
+{
+	struct xfs_log_item_desc *lidp;
+	int			i;
+
+	if (!cur)
+		return NULL;
+
+	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		if (!cur->bc_bufs[i])
+			break;
+		if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno)
+			return cur->bc_bufs[i];
+	}
+
+	/* Chase down all the log items to see if the bp is there */
+	list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
+		struct xfs_buf_log_item	*bip;
+		bip = (struct xfs_buf_log_item *)lidp->lid_item;
+		if (bip->bli_item.li_type == XFS_LI_BUF &&
+		    XFS_BUF_ADDR(bip->bli_buf) == bno)
+			return bip->bli_buf;
+	}
+
+	return NULL;
+}
+/* Assert that keys in a non-leaf block strictly increase, and panic if two child pointers are equal. */
+STATIC void
+xfs_check_block(
+	struct xfs_btree_block	*block,
+	xfs_mount_t		*mp,
+	int			root,	/* nonzero: use the XFS_BMAP_BROOT_PTR_ADDR() layout */
+	short			sz,	/* only used when root is nonzero */
+{
+	int			i, j, dmxr;
+	__be64			*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(be16_to_cpu(block->bb_level) > 0);
+
+	prevp = NULL;
+	for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+		keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
+
+		if (prevp) {
+			ASSERT(be64_to_cpu(prevp->br_startoff) <
+			       be64_to_cpu(keyp->br_startoff));
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+		if (root)
+			pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+		else
+			pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
+
+		for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
+			if (root)
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+			else
+				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
+			if (*thispa == *pp) {
+				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
+					__func__, j, i,
+					(unsigned long long)be64_to_cpu(*thispa));
+				panic("%s: ptrs are equal in node\n",
+					__func__);
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.  Does nothing unless the fork is in btree format.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0, j;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
+	xfs_bmbt_rec_t		last = {0, 0}; /* last extent in prev block */
+	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
+	int			bp_release = 0;	/* 1 when bp was read here and must be released here */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+		if (level == 0)	/* leaf reached: leave with bp still held */
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(mp,
+			XFS_FSB_SANITY_CHECK(mp, bno), error0);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes checking that all extents are in the right order.
+	 */
+	for (;;) {
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = xfs_btree_get_numrecs(block);
+
+		/*
+		 * Remember the right sibling (next leaf), if any; nothing is read ahead here.
+		 */
+
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		ep = XFS_BMBT_REC_ADDR(mp, block, 1);
+		if (i) {
+			ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+			       xfs_bmbt_disk_get_blockcount(&last) <=
+			       xfs_bmbt_disk_get_startoff(ep));
+		}
+		for (j = 1; j < num_recs; j++) {
+			nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+			ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+			       xfs_bmbt_disk_get_blockcount(ep) <=
+			       xfs_bmbt_disk_get_startoff(nextp));
+			ep = nextp;
+		}
+
+		last = *ep;	/* copied by value because bp may be released just below */
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp_release = 0;
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (!bp) {
+			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				goto error_norelse;
+		}
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	xfs_warn(mp, "%s: at error0", __func__);
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:	/* both error labels end in panic(); the return below is never reached */
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+		__func__, i);
+	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+	return;
+}
+
+/*
+ * Add bmap trace insert entries for all the contents of the extent records.
+ */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} + +/* + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the caller's original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. + */ +STATIC void +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ + +/* + * bmap free list manipulation functions + */ + +/* + * Add the extent to the list of 
extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;	/* current (next) element */
+	xfs_bmap_free_item_t	*new;	/* new element */
+	xfs_bmap_free_item_t	*prev;	/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!isnullstartblock(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);	/* extent must not run past its AG */
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);	/* used unchecked; presumably cannot fail with KM_SLEEP -- confirm */
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)	/* first item at or after bno: insert before it */
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;	/* new item becomes the list head */
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist,	/* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);	/* free must not be used after this */
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist)	/* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;	/* saved before "free" is released */
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);	/* prev is NULL: "free" is always the current head */
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Inode fork format manipulation functions
+ */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the file extents are already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork)  /* data or attr fork */
+{
+	/* REFERENCED */
+	struct xfs_btree_block	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	__be64			*pp;	/* ptr to block address */
+	struct xfs_btree_block	*rblock;/* root btree block */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
+	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
+	cbno = be64_to_cpu(*pp);
+	*logflagsp = 0;		/* stays 0 on every error return below */
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
+		return error;
+#endif
+	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+				&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);	/* queue the single child block for freeing */
+	ip->i_d.di_nblocks--;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)	/* drop the cursor's reference to the invalidated buffer */
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+	return 0;
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	struct xfs_btree_block	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	struct xfs_btree_block	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_host_t	*ep;		/* extent record pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i, cnt;		/* extent record index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* number of file extents */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
+	args.tp = tp;
+	args.mp = mp;
+	args.firstblock = *firstblock;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);	/* no allocation yet: start at the inode's own block */
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;	/* exactly one block is wanted */
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		/* NOTE(review): the fork format set to BTREE above is not restored on this path. */
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	abp->b_ops = &xfs_bmbt_buf_ops;
+	ablock = XFS_BUF_TO_BLOCK(abp);
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+	else
+		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				XFS_BTREE_LONG_PTRS);
+
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (cnt = i = 0; i < nextents; i++) {
+		ep = xfs_iext_get_ext(ifp, i);
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {	/* skip records with a null start block */
+			arp->l0 = cpu_to_be64(ep->l0);
+			arp->l1 = cpu_to_be64(ep->l1);
+			arp++; cnt++;
+		}
+	}
+	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_btree_set_numrecs(ablock, cnt);
+
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);	/* back to the first child record */
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
+	*pp = cpu_to_be64(args.fsbno);
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	ASSERT(*curp == NULL);
+	*curp = cur;	/* the cursor is handed to the caller */
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
+	return 0;
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). + */ +void /* only flips format and flags; the ASSERTs below require an already-empty fork */ +xfs_bmap_local_to_extents_empty( + struct xfs_inode *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_bytes == 0); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + + xfs_bmap_forkoff_reset(ip, whichfork); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); +} + + +STATIC int /* error + * + * Converts a local-format fork to extents format. An empty fork is only + * re-flagged. Otherwise one block is allocated, init_fn() is called on its + * buffer, the inline data area is shrunk by if_bytes and a single extent + * record is installed at index 0. *logflagsp receives the inode logging + * flags on every path, including the allocation-error path. + */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, /* data or attr fork */ + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error = 0; + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. 
+ */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + + if (!ifp->if_bytes) { + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags = XFS_ILOG_CORE; + goto done; + } + + flags = 0; + error = 0; + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == + XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + * + * While *firstblock is still NULLFSBLOCK the request is a + * XFS_ALLOCTYPE_START_BNO one at the inode's own block; otherwise + * it is a XFS_ALLOCTYPE_NEAR_BNO one at *firstblock. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; + + /* Can't fail, the space was reserved. */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + + /* + * Initialise the block and copy the data + * + * Note: init_fn must set the buffer log item type correctly! + */ + init_fn(tp, bp, ip, ifp); + + /* + * Account for the change in fork size and log everything. The buffer + * range is logged while if_bytes still holds the inline size; the + * fork is then shrunk by that amount, and + * xfs_bmap_local_to_extents_empty() asserts that if_bytes is zero. 
+ */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags |= XFS_ILOG_CORE; + + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? 
BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); + +done: + *logflagsp = flags; + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle btree format files. + * + * When the data fork root (if_broot_bytes) still fits in XFS_IFORK_DSIZE(ip) + * only XFS_ILOG_DBROOT is added to *flags. Otherwise xfs_btree_new_iroot() + * is called through a data fork cursor; -ENOSPC is returned when it reports + * stat == 0, and on success *firstblock is refreshed from the cursor. + */ +STATIC int /* error */ +xfs_bmap_add_attrfork_btree( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + int error; /* error return value */ + xfs_mount_t *mp; /* file system mount struct */ + int stat; /* newroot status */ + + mp = ip->i_mount; + if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) + *flags |= XFS_ILOG_DBROOT; + else { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.flist = flist; + cur->bc_private.b.firstblock = *firstblock; + if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) + goto error0; + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) + goto error0; + if (stat == 0) { + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return -ENOSPC; + } + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + } + return 0; +error0: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle extents format files. 
+ */ +STATIC int /* error + * + * Nothing to do while di_nextents records still fit in XFS_IFORK_DSIZE(ip). + * Otherwise the data fork is converted with xfs_bmap_extents_to_btree() and + * the cursor it hands back, if any, is deleted here. + */ +xfs_bmap_add_attrfork_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + int error; /* error return value */ + + if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + return 0; + cur = NULL; + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, + flags, XFS_DATA_FORK); + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formatting, others (directories) are so specialised they + * handle everything themselves. + * + * Nothing is converted while if_bytes still fits in XFS_IFORK_DSIZE(ip). + * Directories go through xfs_dir2_sf_to_block(), symlinks through + * xfs_bmap_local_to_extents() with xfs_symlink_local_to_remote; any other + * mode asserts and returns -EFSCORRUPTED. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. 
+ */ +STATIC int /* error */ +xfs_bmap_add_attrfork_local( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fsblock_t *firstblock, /* first block allocated */ + xfs_bmap_free_t *flist, /* blocks to free at commit */ + int *flags) /* inode logging flags */ +{ + xfs_da_args_t dargs; /* args for dir/attr code */ + + if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + return 0; + + if (S_ISDIR(ip->i_d.di_mode)) { + memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; + dargs.dp = ip; + dargs.firstblock = firstblock; + dargs.flist = flist; + dargs.total = dargs.geo->fsbcount; + dargs.whichfork = XFS_DATA_FORK; + dargs.trans = tp; + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + /* should only be called for types that support local format data */ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. 
+ */ +int /* error code; 0 is also returned, after cancelling the transaction, when XFS_IFORK_Q(ip) is already set once the ILOCK is held */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version; set to 2 only when xfs_attr_shortform_bytesfit() returns nonzero and XFS_MOUNT_ATTR2 is set */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + int cancel_flags = 0; /* passed to xfs_trans_cancel() at trans_cancel; gains flags as setup progresses */ + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; + if (XFS_IFORK_Q(ip)) + goto trans_cancel; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. 
+ */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = -EINVAL; + goto trans_cancel; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto bmap_cancel; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + bool log_sb = false; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + log_sb = true; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + log_sb = true; + } + spin_unlock(&mp->m_sb_lock); + if (log_sb) + xfs_log_sb(tp); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + 
if (error) + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: + xfs_bmap_cancel(&flist); +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Internal and external extent tree search functions. + */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + * + * Returns 0 on success, the xfs_btree_read_bufl() error when a block cannot + * be read, or -EFSCORRUPTED when one of the consistency checks fails. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. 
+ */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLFSBLOCK); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); + } + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. + */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + /* + * Read-ahead the next leaf block, if any. + */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); + /* + * Copy records into the extent records. 
+ */ + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; + } + } + xfs_trans_brelse(tp, bp); + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return -EFSCORRUPTED; +} + + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. 
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry; NULL when *eofp is set to 1 */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found; at eof a copy of *prevp when lastx > 0 */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found; only br_startoff = NULLFILEOFF is written when lastx is 0 */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ + + /* + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. + */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; + prevp->br_startoff = NULLFILEOFF; + + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. 
+ */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry; NULL, with *eofp = 1 and *lastxp = NULLEXTNUM, when the block-zero check below trips */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + /* a zero br_startblock is reported and treated as eof, except for the data fork of a realtime inode */ if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). 
+ */ +int /* error; only xfs_iread_extents() can fail here */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* in: lowest offset to consider; out: unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + * The extent must start at least len blocks above both + * lowest and max (the end of the previous extent, clamped + * up to lowest). + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block - 1 before + * last_block (input value) in the file. 
+ * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. 
+ */ +int /* error; -EIO for a fork format other than btree, extents or local */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* in: offset to search before; out: last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return -EIO; + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; + } + /* + * Otherwise *last_block is already the right answer. + */ + return 0; +} + +int /* + * Reads in the fork's extents when XFS_IFEXTENTS is not yet set, then copies + * the final extent record into *rec and sets *is_empty to 0. With no extent + * records *is_empty is set to 1 and *rec is left untouched. Returns the + * xfs_iread_extents() error, else 0. 
+ */ +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + * + * The function's own return value is the error from xfs_bmap_last_extent(), + * else 0; bma->aeof is reset to 0 before that call. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error) + return error; + + if (is_empty) { + bma->aeof = 1; + return 0; + } + + /* + * Check if we are allocating at or past the last extent, or at least into + * the last delayed allocated extent. + */ + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. 
+ */ +int /* error; -EIO for a fork format other than local, btree or extents; *last_block stays 0 on error or when the fork has no extents */ +xfs_bmap_last_offset( + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return -EIO; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + *last_block = rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + * + * In non-DEBUG builds the data fork answer comes from comparing + * XFS_ISIZE(ip) with sb_blocksize alone; the extent record is only + * examined for the attr fork or when DEBUG is defined. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; 
+} + +/* + * Extent tree manipulation functions used during allocation. + */ + +/* + * Convert a delayed allocation to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_delay_real( + struct xfs_bmalloca *bma) +{ + struct xfs_bmbt_irec *new = &bma->got; + int diff; /* temp value */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ + int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + + mp = bma->tp ? bma->tp->t_mountp : NULL; + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_get_all(ep, &PREV); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + da_old = startblockval(PREV.br_startblock); + da_new = 0; + + /* + * Set flags determining what part of the previous delayed allocation + * extent is being replaced by a real allocation. 
+ */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Switch out based on the FILLING and CONTIG state bits. 
+ */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left and right neighbors are both contiguous with new. + */ + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The left neighbor is contiguous, the right is not. 
+ */ + bma->idx--; + + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + PREV.br_blockcount, LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in all of a previously delayed allocation extent. + * The right neighbor is contiguous, the left is not. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, + new->br_startblock, + PREV.br_blockcount + + RIGHT.br_blockcount, PREV.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Filling in all of a previously delayed allocation extent. + * Neither the left nor right neighbors are contiguous with + * the new one. 
+ */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Filling in the first part of a previous delayed allocation. + * The left neighbor is contiguous. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, LEFT.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + + new->br_blockcount, + LEFT.br_state); + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx--; + break; + + case BMAP_LEFT_FILLING: + /* + * 
Filling in the first part of a previous delayed allocation. + * The left neighbor is not contiguous. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, new_endoff); + temp = PREV.br_blockcount - new->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is contiguous with the new allocation. 
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + RIGHT.br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + RIGHT.br_blockcount, + RIGHT.br_state); + if (error) + goto done; + } + + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; + break; + + case BMAP_RIGHT_FILLING: + /* + * Filling in the last part of a previous delayed allocation. + * The right neighbor is not contiguous. 
+ */ + temp = PREV.br_blockcount - new->br_blockcount; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; + break; + + case 0: + /* + * Filling in the middle part of a previous delayed allocation. + * Contiguity is impossible here. + * This case is avoided almost all the time. 
+ * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 + */ + temp = new->br_startoff - PREV.br_startoff; + temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); + rval |= tmp_rval; + if (error) + goto done; + } + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - + (bma->cur ? 
bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); + ASSERT(!error); + if (error) + goto done; + } + + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), + nullstartblock((int)temp2)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + + bma->idx++; + da_new = temp + temp2; + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); + } + + /* clear out the allocated field, done with it now in any case. 
*/ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); +done: + bma->logflags |= rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Convert an unwritten allocation to a real allocation or vice versa. + */ +STATIC int /* error */ +xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp) /* inode logging flags */ +{ + xfs_btree_cur_t *cur; /* btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t new_endoff; /* end offset of new entry */ + xfs_exntst_t newext; /* new extent state */ + xfs_exntst_t oldext; /* old extent state */ + xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; + + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); + +#define LEFT r[0] +#define RIGHT r[1] +#define PREV r[2] + + /* + * Set up a bunch of variables to make the tests simpler. + */ + error = 0; + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &PREV); + newext = new->br_state; + oldext = (newext == XFS_EXT_UNWRITTEN) ? 
+ XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + ASSERT(PREV.br_state == oldext); + new_endoff = new->br_startoff + new->br_blockcount; + ASSERT(PREV.br_startoff <= new->br_startoff); + ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + + /* + * Check and set flags if this segment has a left neighbor. + * Don't set contiguous if the combined extent would be too large. + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + /* + * Check and set flags if this segment has a right neighbor. + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. 
+ */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. 
+ */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); + ip->i_d.di_nextents -= 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount + + RIGHT.br_blockcount, LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. 
+ */ + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), + LEFT.br_blockcount + PREV.br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + PREV.br_blockcount, + LEFT.br_state))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount + RIGHT.br_blockcount); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); + ip->i_d.di_nextents--; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + RIGHT.br_startblock, + RIGHT.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), + LEFT.br_blockcount + new->br_blockcount); + xfs_bmbt_set_startoff(ep, + PREV.br_startoff + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_decrement(cur, 0, &i))) + goto done; + error = xfs_bmbt_update(cur, LEFT.br_startoff, + LEFT.br_startblock, + LEFT.br_blockcount + new->br_blockcount, + LEFT.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); + xfs_bmbt_set_startoff(ep, new_endoff); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + xfs_bmbt_set_startblock(ep, + new->br_startblock + new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, + PREV.br_startoff + new->br_blockcount, + PREV.br_startblock + new->br_blockcount, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + cur->bc_rec.b = *new; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + if (cur == NULL) + rval = XFS_ILOG_DEXT; + else { + rval = 0; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + if ((error = xfs_bmbt_update(cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + RIGHT.br_blockcount, + newext))) + goto done; + } + break; + + case BMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + PREV.br_blockcount - new->br_blockcount); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + + ip->i_d.di_nextents++; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + PREV.br_startblock, + PREV.br_blockcount - new->br_blockcount, + oldext))) + goto done; + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + cur->bc_rec.b.br_state = XFS_EXT_NORM; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. 
+ */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, + new->br_startoff - PREV.br_startoff); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + r[0] = *new; + r[1].br_startoff = new_endoff; + r[1].br_blockcount = + PREV.br_startoff + PREV.br_blockcount - new_endoff; + r[1].br_startblock = new->br_startblock + new->br_blockcount; + r[1].br_state = oldext; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + + ip->i_d.di_nextents += 2; + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { + rval = XFS_ILOG_CORE; + if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff, + PREV.br_startblock, PREV.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* new right extent - oldext */ + if ((error = xfs_bmbt_update(cur, r[1].br_startoff, + r[1].br_startblock, r[1].br_blockcount, + r[1].br_state))) + goto done; + /* new left extent - oldext */ + cur->bc_rec.b = PREV; + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. 
+ */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + /* new middle extent - newext */ + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; + } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); +done: + *logflagsp |= rval; + return error; +#undef LEFT +#undef RIGHT +#undef PREV +} + +/* + * Convert a hole to a delayed allocation. 
+ */ +STATIC void +xfs_bmap_add_extent_hole_delay( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int state; /* state bits, accessed thru macros */ + xfs_filblks_t temp=0; /* temp for indirect calculations */ + + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + state = 0; + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. 
+ */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. + */ + --*idx; + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. 
+ */ + --*idx; + temp = left.br_blockcount + new->br_blockcount; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, *idx, 1, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); + /* + * Nothing to do for disk quota accounting here. + */ + } +} + +/* + * Convert a hole to a real allocation. 
+ */ +STATIC int /* error */ +xfs_bmap_add_extent_hole_real( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec *new = &bma->got; + int error; /* error return value */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + int rval=0; /* return value (logging flags) */ + int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + + mp = bma->tp ? bma->tp->t_mountp : NULL; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + + state = 0; + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + /* + * Check and set flags if this segment has a left neighbor. + */ + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if this segment has a current value. + * Not true if we're inserting into the "hole" at eof. + */ + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * We're inserting a real allocation between "left" and "right". + * Set the contiguity flags. Don't let extents get too large. 
+ */ + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + + error = 0; + /* + * Select which case we're in here, and implement it. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with real allocations on the + * left and on the right. + * Merge all three into a single extent record. 
+ */ + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + left.br_blockcount + new->br_blockcount + + right.br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount + + right.br_blockcount, + left.br_state); + if (error) + goto done; + } + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the left. + * Merge the new allocation with the left neighbor. 
+ */ + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + left.br_blockcount + new->br_blockcount); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, + left.br_startblock, + left.br_blockcount + + new->br_blockcount, + left.br_state); + if (error) + goto done; + } + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a real allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + new->br_startoff, new->br_startblock, + new->br_blockcount + right.br_blockcount, + right.br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); + } else { + rval = 0; + error = xfs_bmbt_lookup_eq(bma->cur, + right.br_startoff, + right.br_startblock, + right.br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, + new->br_startblock, + new->br_blockcount + + right.br_blockcount, + right.br_state); + if (error) + goto done; + } + break; + + case 0: + /* + * New allocation is not contiguous with another + * real allocation. + * Insert a new entry. 
+ */ + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + } else { + rval = XFS_ILOG_CORE; + error = xfs_bmbt_lookup_eq(bma->cur, + new->br_startoff, + new->br_startblock, + new->br_blockcount, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + break; + } + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); +done: + bma->logflags |= rval; + return error; +} + +/* + * Functions used in the extent read, allocate and remove paths + */ + +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. + */ +int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? 
*/ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + + /* Same adjustment for the end of the requested area. */ + temp = (align_alen % extsz); + if (temp) + align_alen += extsz - temp; + + /* + * For large extent hint sizes, the aligned extent might be larger than + * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls + * the length back under MAXEXTLEN. The outer allocation loops handle + * short allocation just fine, so it is safe to do this. We only want to + * do it when we are forced to, though, because it means more allocation + * operations are required. 
+ */ + while (align_alen > MAXEXTLEN) + align_alen -= extsz; + ASSERT(align_alen <= MAXEXTLEN); + + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. + * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. + */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. 
+ */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return -EINVAL; + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. + */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return -EINVAL; + } else { + ASSERT(orig_off >= align_off); + /* see MAXEXTLEN handling above */ + ASSERT(orig_end <= align_off + align_alen || + align_alen + extsz > MAXEXTLEN); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + +#define XFS_ALLOC_GAP_UNITS 4 + +void +xfs_bmap_adjacent( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_fsblock_t adjust; /* adjustment to block numbers */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_mount_t *mp; /* mount point structure */ + int nullfb; /* true if ap->firstblock isn't set */ + int rt; /* true if inode is realtime */ + +#define ISVALID(x,y) \ + (rt ? \ + (x) < mp->m_sb.sb_rblocks : \ + XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \ + XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) + + mp = ap->ip->i_mount; + nullfb = *ap->firstblock == NULLFSBLOCK; + rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); + /* + * If allocating at eof, and there's a previous real block, + * try to use its last block as our starting point. + */ + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; + /* + * Adjust for the gap between prevp and us. + */ + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); + if (adjust && + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; + } + /* + * If not at eof, then compare the two neighbor blocks. + * Figure out whether either one gives us a good starting point, + * and pick the better one. + */ + else if (!ap->eof) { + xfs_fsblock_t gotbno; /* right side block number */ + xfs_fsblock_t gotdiff=0; /* right side difference */ + xfs_fsblock_t prevbno; /* left side block number */ + xfs_fsblock_t prevdiff=0; /* left side difference */ + + /* + * If there's a previous (left) block, select a requested + * start block based on it. + */ + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { + /* + * Calculate gap to end of previous block. + */ + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); + /* + * Figure the startblock based on the previous block's + * end and the gap size. + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the end of the previous block. 
+ */ + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && + ISVALID(prevbno + prevdiff, + ap->prev.br_startblock)) + prevbno += adjust; + else + prevdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno) + prevbno = NULLFSBLOCK; + } + /* + * No previous block or can't follow it, just default. + */ + else + prevbno = NULLFSBLOCK; + /* + * If there's a following (right) block, select a requested + * start block based on it. + */ + if (!isnullstartblock(ap->got.br_startblock)) { + /* + * Calculate gap to start of next block. + */ + adjust = gotdiff = ap->got.br_startoff - ap->offset; + /* + * Figure the startblock based on the next block's + * start and the gap size. + */ + gotbno = ap->got.br_startblock; + /* + * Heuristic! + * If the gap is large relative to the piece we're + * allocating, or using it gives us an invalid block + * number, then just use the start of the next block + * offset by our length. + */ + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && + ISVALID(gotbno - gotdiff, gotbno)) + gotbno -= adjust; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; + } else + gotdiff += adjust; + /* + * If the firstblock forbids it, can't use it, + * must use default. + */ + if (!rt && !nullfb && + XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno) + gotbno = NULLFSBLOCK; + } + /* + * No next block, just default. + */ + else + gotbno = NULLFSBLOCK; + /* + * If both valid, pick the better one, else the only good + * one, else ap->blkno is already set (to 0 or the inode block). + */ + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + ap->blkno = prevdiff <= gotdiff ? 
prevbno : gotbno; + else if (prevbno != NULLFSBLOCK) + ap->blkno = prevbno; + else if (gotbno != NULLFSBLOCK) + ap->blkno = gotbno; + } +#undef ISVALID +} +/* + * Fold the longest free extent of allocation group ag into *blen. + * + * The per-AG structure is obtained with xfs_perag_get(). If its pagf_init + * flag is clear, xfs_alloc_pagf_init() is called with XFS_ALLOC_FLAG_TRYLOCK; + * if the flag is still clear afterwards, *notinit is set to 1 and *blen is + * left unchanged. Otherwise *blen is raised to the value returned by + * xfs_alloc_longest_free_extent() when that value is larger. + * + * Returns 0, or the error from xfs_alloc_pagf_init(). The perag reference + * is released with xfs_perag_put() on every path. + */ +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, /* transaction; tp->t_mountp supplies the mount */ + xfs_agnumber_t ag, /* allocation group to examine */ + xfs_extlen_t *blen, /* in/out: longest free extent seen so far */ + int *notinit) /* out: set to 1 when pagf_init stays clear */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; + + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + if (error) + goto out; + + if (!pag->pagf_init) { /* init call returned 0 but did not set the flag */ + *notinit = 1; + goto out; + } + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) /* *blen only ever grows here */ + *blen = longest; + +out: + xfs_perag_put(pag); + return error; +} +/* + * Choose args->minlen from the best free extent length seen (*blen): + * ap->minlen when notinit is set or *blen is below ap->minlen, *blen when it + * is below args->maxlen, and args->maxlen otherwise. + */ +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { + /* + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. + */ + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; + } else { + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. 
+ */ + args->minlen = args->maxlen; + } +} + +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + while (*blen < args->maxlen) { + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); + return 0; +} + +STATIC int +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); + if (error) + return error; + + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); + if (error) + return error; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); + + /* + * Set the failure fallback case to look in the selected AG as stream + * may have moved. 
+ */ + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); + return 0; +} + +STATIC int +xfs_bmap_btalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_mount_t *mp; /* mount point structure */ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ + xfs_agnumber_t ag; + xfs_alloc_arg_t args; + xfs_extlen_t blen; + xfs_extlen_t nextminlen = 0; + int nullfb; /* true if ap->firstblock isn't set */ + int isaligned; + int tryagain; + int error; + int stripe_align; + + ASSERT(ap->length); + + mp = ap->ip->i_mount; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 0, ap->eof, 0, ap->conv, + &ap->offset, &ap->length); + ASSERT(!error); + ASSERT(ap->length); + } + + + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else + ap->blkno = *ap->firstblock; + + xfs_bmap_adjacent(ap); + + /* + * If allowed, use ap->blkno; otherwise must use firstblock since + * it's in the right allocation group. + */ + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) + ; + else + ap->blkno = *ap->firstblock; + /* + * Normal allocation, done through xfs_alloc_vextent. 
+ */ + tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); + args.tp = ap->tp; + args.mp = mp; + args.fsbno = ap->blkno; + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; + blen = 0; + if (nullfb) { + /* + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. + */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); + else + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; + } else if (ap->flist->xbf_low) { + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; + args.total = args.minlen = ap->minlen; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.total = ap->total; + args.minlen = ap->minlen; + } + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { + args.prod = 1; + args.mod = 0; + } else { + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) + args.mod = (xfs_extlen_t)(args.prod - args.mod); + } + /* + * If we are not low on available data blocks, and the + * underlying logical volume manager is a stripe, and + * the file offset is zero then try to allocate data + * blocks on stripe unit boundary. + * NOTE: ap->aeof is only set if the allocation length + * is >= the stripe unit and the allocation offset is + * at the end of file. 
+ */ + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { + args.alignment = stripe_align; + atype = args.type; + isaligned = 1; + /* + * Adjust for alignment + */ + if (blen > args.alignment && blen <= args.maxlen) + args.minlen = blen - args.alignment; + args.minalignslop = 0; + } else { + /* + * First try an exact bno allocation. + * If it fails then do a near or start bno + * allocation with alignment turned on. + */ + atype = args.type; + tryagain = 1; + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.alignment = 1; + /* + * Compute the minlen+alignment for the + * next case. Set slop so that the value + * of minlen+alignment+slop doesn't go up + * between the calls. + */ + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; + else + nextminlen = args.minlen; + if (nextminlen + stripe_align > args.minlen + 1) + args.minalignslop = + nextminlen + stripe_align - + args.minlen - 1; + else + args.minalignslop = 0; + } + } else { + args.alignment = 1; + args.minalignslop = 0; + } + args.minleft = ap->minleft; + args.wasdel = ap->wasdel; + args.isfl = 0; + args.userdata = ap->userdata; + if ((error = xfs_alloc_vextent(&args))) + return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { + /* + * Exact allocation failed. Now try with alignment + * turned on. + */ + args.type = atype; + args.fsbno = ap->blkno; + args.alignment = stripe_align; + args.minlen = nextminlen; + args.minalignslop = 0; + isaligned = 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (isaligned && args.fsbno == NULLFSBLOCK) { + /* + * allocation failed, so turn off alignment and + * try again. 
+ */ + args.type = atype; + args.fsbno = ap->blkno; + args.alignment = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb && + args.minlen > ap->minlen) { + args.minlen = ap->minlen; + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = ap->blkno; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + if (args.fsbno == NULLFSBLOCK && nullfb) { + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.total = ap->minlen; + args.minleft = 0; + if ((error = xfs_alloc_vextent(&args))) + return error; + ap->flist->xbf_low = 1; + } + if (args.fsbno != NULLFSBLOCK) { + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; + ASSERT(nullfb || fb_agno == args.agno || + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; + ap->ip->i_d.di_nblocks += args.len; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= args.len; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : + XFS_TRANS_DQ_BCOUNT, + (long) args.len); + } else { + ap->blkno = NULLFSBLOCK; + ap->length = 0; + } + return 0; +} + +/* + * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. + * It figures out where to ask the underlying allocator to put the new extent. 
+ */ +STATIC int +xfs_bmap_alloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) /* realtime inode with ap->userdata set */ + return xfs_bmap_rtalloc(ap); + return xfs_bmap_btalloc(ap); /* every other request */ +} + +/* + * Trim the returned map to the required bounds. + * + * Fills *mval from the extent record *got. With XFS_BMAPI_ENTIRE set, or + * when got ends at or before obno, the whole record is copied. Otherwise + * *bno is raised to obno when it is lower, *mval starts at *bno and a real + * start block is advanced by (*bno - got->br_startoff). In both cases a + * null start block is reported as DELAYSTARTBLOCK. + */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, /* out: mapping being built */ + struct xfs_bmbt_irec *got, /* extent record to trim */ + xfs_fileoff_t *bno, /* in/out: current file offset */ + xfs_filblks_t len, /* length still wanted; checked by an ASSERT */ + xfs_fileoff_t obno, /* file offset the request started at */ + xfs_fileoff_t end, /* file offset the request ends at */ + int n, /* mappings emitted so far; used in an ASSERT only */ + int flags) /* only XFS_BMAPI_ENTIRE is tested here */ +{ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { /* whole record wanted, or it ends at or before obno */ + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) /* never start before the requested offset */ + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. 
+ */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return. + * + * *map points at the mapping that was just filled in. *bno is advanced to + * the end of that mapping and *len is set to end - *bno. The mapping is + * then either folded into the previous array entry (same start offset, + * real blocks that directly follow the previous entry's blocks, or a + * delayed allocation adjacent to a delayed previous entry), leaving *map + * and *n unchanged, or kept as a new entry by advancing *map and + * incrementing *n. A first mapping (*n == 0) that ends at or before obno + * is not counted: *map and *n are left unchanged. + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, /* in/out: current slot in the output array */ + xfs_fileoff_t *bno, /* out: file offset following the mapping */ + xfs_filblks_t *len, /* in/out: length still to map */ + xfs_fileoff_t obno, /* file offset the request started at */ + xfs_fileoff_t end, /* file offset the request ends at */ + int *n, /* in/out: number of mappings kept */ + int flags) /* XFS_BMAPI_ENTIRE and XFS_BMAPI_IGSTATE are tested */ +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { /* real blocks directly after the previous entry's; states must match unless XFS_BMAPI_IGSTATE */ + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { /* delayed allocation adjacent in file offset to a delayed previous entry */ + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { /* not mergeable: keep as a new entry */ + mval++; + (*n)++; + } + *map = mval; /* hand the (possibly advanced) slot back to the caller */ +} + +/* + * Map file blocks to filesystem blocks without 
allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + XFS_STATS_INC(xs_blk_mapr); + + ifp = XFS_IFORK_PTR(ip, whichfork); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. 
*/ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. 
+ */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_frextents(mp, -((int64_t)extsz)); + } else { + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_frextents(mp, extsz); + else + xfs_mod_fdblocks(mp, alen, false); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... 
*/ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; /* error return */ + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* Only extents- and btree-format data forks can be mapped here. */ + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + XFS_STATS_INC(xs_blk_mapw); + /* XFS_IFEXTENTS clear: bring the extent records in via xfs_iread_extents(). */ + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { /* bno lies in a hole or past the last extent */ + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { /* nothing mapped yet: fail; otherwise return the partial map */ + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. 
*/ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + +/* + * Allocate real blocks for the range described by @bma: either a hole or, + * when bma->wasdel is set, the delayed allocation extent in bma->got. The new + * extent is added through xfs_bmap_add_extent_delay_real() or + * xfs_bmap_add_extent_hole_real() and bma->got is re-read afterwards. If the + * allocator found no space, 0 is returned with bma->blkno equal to + * NULLFSBLOCK. + */ +static int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + ASSERT(bma->length > 0); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(bma->flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. 
+ */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) + return error; + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) /* nothing was allocated; not reported as an error */ + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { /* XFS_IFBROOT set but no cursor yet */ + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; /* NOTE(review): tmp_logflags is never written in this function, so this OR is a no-op */ + if (error) + return error; + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. 
+ */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} +/* + * Flip the written/unwritten state of the mapping in @mval when @flags ask + * for it: an unwritten extent becomes written unless XFS_BMAPI_PREALLOC is + * set, a written extent becomes unwritten only when both XFS_BMAPI_PREALLOC + * and XFS_BMAPI_CONVERT are set. Returns -EAGAIN when the converted mapping + * is shorter than @len, 0 when nothing is left to do, or the error from the + * extent update. + */ +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* an unwritten extent stays unwritten for a preallocation request */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* a written extent is only converted when both PREALLOC and CONVERT are set */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. 
+ */ + if (mval->br_blockcount < len) + return -EAGAIN; + return 0; +} + +/* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Detailed behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. 
for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; /* inode fork pointer */ + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + /* Snapshot of the arguments, handed to xfs_bmap_validate_ret() on success. */ +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
+ XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + ASSERT(len > 0); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + XFS_STATS_INC(xs_blk_mapw); + /* + * *firstblock still NULLFSBLOCK: set minleft to bb_level of the btree + * root plus one for a btree-format fork, or to 1 otherwise. When + * *firstblock is already set, minleft is 0. + */ + if (*firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + else + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); + n = 0; + end = bno + len; + obno = bno; + /* Fields of bma that are set once, before the mapping loop. */ + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; + + while (bno < end && n < *nmap) { + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.offset = bno; + bma.flags = flags; + + /* + * There's a 32/64 bit type mismatch between the + * allocation length request (which can be 64 bits in + * length) and the bma length request, which is + * xfs_extlen_t and therefore 32 bits. Hence we have to + * check for 32-bit overflows and handle them here. 
+ */ + if (len > (xfs_filblks_t)MAXEXTLEN) + bma.length = MAXEXTLEN; + else + bma.length = len; + + ASSERT(len > 0); + ASSERT(bma.length > 0); + error = xfs_bmapi_allocate(&bma); + if (error) + goto error0; + if (bma.blkno == NULLFSBLOCK) /* no space was allocated: return what has been mapped so far */ + break; + } + + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == -EAGAIN) /* mval ended up shorter than len: go around again */ + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* + * If we're done, stop now. Stop when we've allocated + * *nmap extents (asserted above to be at most + * XFS_BMAP_MAX_NMAP) no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) + break; + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else + eof = 1; + } + *nmap = n; + + /* + * Transform from btree to extents, give it cur. + */ + if (xfs_bmap_wants_extents(ip, whichfork)) { + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, + &tmp_logflags, whichfork); + bma.logflags |= tmp_logflags; + if (error) + goto error0; + } + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. 
+ */ + if ((bma.logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma.logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. + */ + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(bma.cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) /* NOTE(review): the orig_* variables exist only under DEBUG; presumably xfs_bmap_validate_ret() expands to nothing otherwise */ + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* + * Called by xfs_bunmapi to update file extent records and the btree + * after removing space (or undoing a delayed allocation). 
+ */ +STATIC int /* error */ +xfs_bmap_del_extent( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_trans_t *tp, /* current transaction pointer */ + xfs_extnum_t *idx, /* extent number to update/delete */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + xfs_btree_cur_t *cur, /* if null, not a btree */ + xfs_bmbt_irec_t *del, /* data to remove from extents */ + int *logflagsp, /* out: inode logging flags */ + int whichfork) /* data or attr fork */ +{ + xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ + xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ + xfs_fsblock_t del_endblock=0; /* first block past del */ + xfs_fileoff_t del_endoff; /* first offset past del */ + int delay; /* current block is delayed allocated */ + int do_fx; /* free extent at end of routine */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ + int error; /* error return value */ + int flags; /* inode logging flags */ + xfs_bmbt_irec_t got; /* current extent entry */ + xfs_fileoff_t got_endoff; /* first offset past got */ + int i; /* temp state */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t nblks; /* quota/sb block count */ + xfs_bmbt_irec_t new; /* new record to be inserted */ + /* REFERENCED */ + uint qfield; /* quota field to update */ + xfs_filblks_t temp; /* for indirect length calculations */ + xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; /* BMAP_* flags passed to tracing and xfs_iext_insert() */ + + XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ep = xfs_iext_get_ext(ifp, *idx); + xfs_bmbt_get_all(ep, &got); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + 
delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + flags = 0; + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + flags = XFS_ILOG_CORE; + /* + * Realtime allocation. Free it and record di_nblocks update. + */ + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + xfs_fsblock_t bno; + xfs_filblks_t len; + + ASSERT(do_mod(del->br_blockcount, + mp->m_sb.sb_rextsize) == 0); + ASSERT(do_mod(del->br_startblock, + mp->m_sb.sb_rextsize) == 0); + bno = del->br_startblock; + len = del->br_blockcount; + do_div(bno, mp->m_sb.sb_rextsize); /* convert start and length to realtime extent units */ + do_div(len, mp->m_sb.sb_rextsize); + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + do_fx = 0; + nblks = len * mp->m_sb.sb_rextsize; + qfield = XFS_TRANS_DQ_RTBCOUNT; + } + /* + * Ordinary allocation. + */ + else { + do_fx = 1; + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + } + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + if (cur) { + if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, got.br_blockcount, + &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } + da_old = da_new = 0; + } else { /* delalloc: no disk space to free, only the indirect block reservation to adjust */ + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + do_fx = 0; + } + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? 
BMAP_ATTRFORK : 0); + --*idx; + if (delay) /* delalloc: the fork extent count and the btree are left alone */ + break; + + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + flags |= XFS_ILOG_CORE; + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_btree_delete(cur, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { /* new indirect reservation: worst case for the rest, capped at the old one */ + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { /* new indirect reservation: worst case for the rest, capped at the old one */ + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (!cur) { + flags |= xfs_ilog_fext(whichfork); + break; + } + if ((error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount - del->br_blockcount, + got.br_state))) + goto done; + break; + + case 0: + /* + * Deleting the middle of the extent. 
+ */ + temp = del->br_startoff - got.br_startoff; /* blocks kept on the left, in the existing record */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; /* blocks kept on the right, described by new */ + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + flags |= XFS_ILOG_CORE; + if (cur) { + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, temp, + got.br_state))) + goto done; + if ((error = xfs_btree_increment(cur, 0, &i))) + goto done; + cur->bc_rec.b = new; + error = xfs_btree_insert(cur, &i); + if (error && error != -ENOSPC) + goto done; + /* + * If we get no-space back from btree insert, + * it tried a split, and we have a zero + * block reservation. + * Fix up our state and return the error. + */ + if (error == -ENOSPC) { + /* + * Reset the cursor, don't trust + * it after any insert operation. + */ + if ((error = xfs_bmbt_lookup_eq(cur, + got.br_startoff, + got.br_startblock, + temp, &i))) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); + /* + * Update the btree record back + * to the original value. + */ + if ((error = xfs_bmbt_update(cur, + got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state))) + goto done; + /* + * Reset the extent record back + * to the original value. 
+ */ + xfs_bmbt_set_blockcount(ep, + got.br_blockcount); + flags = 0; + error = -ENOSPC; + goto done; + } + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + } else + flags |= xfs_ilog_fext(whichfork); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + } else { + ASSERT(whichfork == XFS_DATA_FORK); + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { /* trim both halves alternately until the old reservation covers them */ + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; + break; + } + /* + * If we need to, add to list of extents to delete. + */ + if (do_fx) + xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, + mp); + /* + * Adjust inode # blocks in the file. + */ + if (nblks) + ip->i_d.di_nblocks -= nblks; + /* + * Adjust quota data. + */ + if (qfield) + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); +done: + *logflagsp = flags; + return error; +} + +/* + * Unmap (remove) blocks from a file. + * If nexts is nonzero then the number of extents to remove is limited to + * that value. *done is set to 1 when the whole block range has been + * processed (or the fork has no extents) and to 0 when extents remain. 
+ */ +int /* error */ +xfs_bunmapi( + xfs_trans_t *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting offset to unmap */ + xfs_filblks_t len, /* length to unmap in file */ + int flags, /* misc flags */ + xfs_extnum_t nexts, /* max extents to unmap, 0 = no limit */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *done) /* out: 1 when finished, else 0 */ +{ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_irec_t del; /* extent being deleted */ + int eof; /* is deleting at eof */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t extno; /* extent number in list */ + xfs_bmbt_irec_t got; /* current extent record */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int isrt; /* freeing in rt area */ + xfs_extnum_t lastx; /* last extent index used */ + int logflags; /* transaction logging flags */ + xfs_extlen_t mod; /* rt extent offset */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_irec_t prev; /* previous extent record */ + xfs_fileoff_t start; /* first file offset deleted */ + int tmp_logflags; /* partial logging flags */ + int wasdel; /* was a delayed alloc extent */ + int whichfork; /* data or attribute fork */ + xfs_fsblock_t sum; /* first block past del */ + + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? 
+ XFS_ATTR_FORK : XFS_DATA_FORK; + ifp = XFS_IFORK_PTR(ip, whichfork); + if (unlikely( + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + mp = ip->i_mount; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(len > 0); + ASSERT(nexts >= 0); + + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { /* no extent records: nothing to unmap */ + *done = 1; + return 0; + } + XFS_STATS_INC(xs_blk_unmap); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + start = bno; + bno = start + len - 1; /* the range is walked backwards from its last block */ + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + + /* + * Check to see if the given block number is past the end of the + * file, back up to the last block if so... + */ + if (eof) { + ep = xfs_iext_get_ext(ifp, --lastx); + xfs_bmbt_get_all(ep, &got); + bno = got.br_startoff + got.br_blockcount - 1; + } + logflags = 0; + if (ifp->if_flags & XFS_IFBROOT) { + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else + cur = NULL; + + if (isrt) { + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + } + + extno = 0; + while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && + (nexts == 0 || extno < nexts)) { + /* + * Is the found extent after a hole in which bno lives? + * Just back up to the previous extent, if so. 
+ */ + if (got.br_startoff > bno) { + if (--lastx < 0) + break; + ep = xfs_iext_get_ext(ifp, lastx); + xfs_bmbt_get_all(ep, &got); + } + /* + * Is the last block of this extent before the range + * we're supposed to delete? If so, we're done. + */ + bno = XFS_FILEOFF_MIN(bno, + got.br_startoff + got.br_blockcount - 1); + if (bno < start) + break; + /* + * Then deal with the (possibly delayed) allocated space + * we found. + */ + ASSERT(ep != NULL); + del = got; + wasdel = isnullstartblock(del.br_startblock); + if (got.br_startoff < start) { /* clip del so it does not begin before start */ + del.br_startoff = start; + del.br_blockcount -= start - got.br_startoff; + if (!wasdel) + del.br_startblock += start - got.br_startoff; + } + if (del.br_startoff + del.br_blockcount > bno + 1) /* ...and does not extend past bno */ + del.br_blockcount = bno + 1 - del.br_startoff; + sum = del.br_startblock + del.br_blockcount; + if (isrt && + (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent not lined up at the end. + * The extent could have been split into written + * and unwritten pieces, or we could just be + * unmapping part of it. But we can't really + * get rid of part of a realtime extent. + */ + if (del.br_state == XFS_EXT_UNWRITTEN || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * This piece is unwritten, or we're not + * using unwritten extents. Skip over it. + */ + ASSERT(bno >= mod); + bno -= mod > del.br_blockcount ? + del.br_blockcount : mod; + if (bno < got.br_startoff) { + if (--lastx >= 0) + xfs_bmbt_get_all(xfs_iext_get_ext( + ifp, lastx), &got); + } + continue; + } + /* + * It's written, turn it unwritten. + * This is better than zeroing it. + */ + ASSERT(del.br_state == XFS_EXT_NORM); + ASSERT(xfs_trans_get_block_res(tp) > 0); + /* + * If this spans a realtime extent boundary, + * chop it back to the start of the one we end at. 
+ */ + if (del.br_blockcount > mod) { + del.br_startoff += del.br_blockcount - mod; + del.br_startblock += del.br_blockcount - mod; + del.br_blockcount = mod; + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); + if (error) + goto error0; + goto nodelete; + } + if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) { + /* + * Realtime extent is lined up at the end but not + * at the front. We'll get rid of full extents if + * we can. + */ + mod = mp->m_sb.sb_rextsize - mod; /* blocks from del's start up to the next rt extent boundary */ + if (del.br_blockcount > mod) { + del.br_blockcount -= mod; + del.br_startoff += mod; + del.br_startblock += mod; + } else if ((del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + xfs_trans_get_block_res(tp) == 0)) || + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + /* + * Can't make it unwritten. There isn't + * a full extent here so just skip it. + */ + ASSERT(bno >= del.br_blockcount); + bno -= del.br_blockcount; + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } + } + continue; + } else if (del.br_state == XFS_EXT_UNWRITTEN) { + /* + * This one is already unwritten. + * It must have a written left neighbor. + * Unwrite the killed part of that one and + * try again. 
+ */ + ASSERT(lastx > 0); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + lastx - 1), &prev); + ASSERT(prev.br_state == XFS_EXT_NORM); + ASSERT(!isnullstartblock(prev.br_startblock)); + ASSERT(del.br_startblock == + prev.br_startblock + prev.br_blockcount); + if (prev.br_startoff < start) { + mod = start - prev.br_startoff; + prev.br_blockcount -= mod; + prev.br_startblock += mod; + prev.br_startoff = start; + } + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); + if (error) + goto error0; + goto nodelete; + } else { + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); + if (error) + goto error0; + goto nodelete; + } + } + if (wasdel) { + ASSERT(startblockval(del.br_startblock) > 0); + /* Update realtime/data freespace, unreserve quota */ + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_frextents(mp, (int64_t)rtexts); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_REGBLKS); + } + ip->i_delayed_blks -= del.br_blockcount; + if (cur) + cur->bc_private.b.flags |= + XFS_BTCUR_BPRV_WASDEL; + } else if (cur) + cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; + /* + * If it's the case where the directory code is running + * with no block reservation, and the deleted block is in + * the middle of its extent, and the resulting insert + * of an extent would cause transformation to btree format, + * then reject it. The calling code will then swap + * blocks around instead. 
+ * We have to do this now, rather than waiting for the + * conversion to btree format, since the transaction + * will be dirty. + */ + if (!wasdel && xfs_trans_get_block_res(tp) == 0 && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && + del.br_startoff > got.br_startoff && + del.br_startoff + del.br_blockcount < + got.br_startoff + got.br_blockcount) { + error = -ENOSPC; + goto error0; + } + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + bno = del.br_startoff - 1; /* continue just below the piece that was removed */ +nodelete: + /* + * If not done go on to the next (previous) record. + */ + if (bno != (xfs_fileoff_t)-1 && bno >= start) { + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } + xfs_bmbt_get_all(ep, &got); + } + extno++; + } + } + *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + /* i.e. done once the walk has passed the front of the range or of the extent list */ + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from btree to extents, give it cur + */ + else if (xfs_bmap_wants_extents(ip, whichfork)) { + ASSERT(cur != NULL); + error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + whichfork); + logflags |= tmp_logflags; + if (error) + goto error0; + } + /* + * transform from extents to local? + */ + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. 
+ */ + if ((logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log inode even in the error case, if the transaction + * is dirty we'll need to shut down the filesystem. + */ + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + if (!error) { + *firstblock = cur->bc_private.b.firstblock; + cur->bc_private.b.allocated = 0; + } + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + +/* + * Determine whether an extent shift can be accomplished by a merge with the + * extent that precedes the target hole of the shift. + */ +STATIC bool +xfs_bmse_can_merge( + struct xfs_bmbt_irec *left, /* preceding extent */ + struct xfs_bmbt_irec *got, /* current extent to shift */ + xfs_fileoff_t shift) /* shift fsb */ +{ + xfs_fileoff_t startoff; + + startoff = got->br_startoff - shift; + + /* + * The extent, once shifted, must be adjacent in-file and on-disk with + * the preceding extent. + */ + if ((left->br_startoff + left->br_blockcount != startoff) || + (left->br_startblock + left->br_blockcount != got->br_startblock) || + (left->br_state != got->br_state) || + (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + return false; + + return true; +} + +/* + * A bmap extent shift adjusts the file offset of an extent to fill a preceding + * hole in the file. If an extent shift would result in the extent being fully + * adjacent to the extent that currently precedes the hole, we can merge with + * the preceding extent rather than do the shift. + * + * This function assumes the caller has verified a shift-by-merge is possible + * with the provided extents via xfs_bmse_can_merge(). 
+ */
+STATIC int
+xfs_bmse_merge(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			shift,		/* shift fsb */
+	int				current_ext,	/* idx of gotp */
+	struct xfs_bmbt_rec_host	*gotp,		/* extent to shift */
+	struct xfs_bmbt_rec_host	*leftp,		/* preceding extent */
+	struct xfs_btree_cur		*cur,
+	int				*logflags)	/* output */
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	xfs_filblks_t			merged_len;
+	int				stat;
+	int				error;
+
+	/* Decode both records before either of them is modified. */
+	xfs_bmbt_get_all(gotp, &got);
+	xfs_bmbt_get_all(leftp, &left);
+	merged_len = left.br_blockcount + got.br_blockcount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(xfs_bmse_can_merge(&left, &got, shift));
+
+	/*
+	 * In-core merge: grow the left record, then drop the shifted one.
+	 * gotp and current_ext must not be used after xfs_iext_remove().
+	 */
+	xfs_bmbt_set_blockcount(leftp, merged_len);
+	xfs_iext_remove(ip, current_ext, 1, 0);
+
+	/* One extent fewer in the fork; the inode core has to be logged. */
+	XFS_IFORK_NEXT_SET(ip, whichfork,
+		XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+	*logflags |= XFS_ILOG_CORE;
+
+	/* Without a cursor there is no btree to fix up; log the extents. */
+	if (cur == NULL) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	/* Btree step 1: find the record that was merged away and delete it. */
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, stat == 1);
+
+	error = xfs_btree_delete(cur, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, stat == 1);
+
+	/* Btree step 2: find the left record (old length) and grow it. */
+	error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
+				   left.br_blockcount, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, stat == 1);
+
+	return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
+			       merged_len, left.br_state);
+}
+
+/*
+ * Shift a single extent.
+ */
+STATIC int
+xfs_bmse_shift_one(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			offset_shift_fsb,
+	int				*current_ext,
+	struct xfs_bmbt_rec_host	*gotp,
+	struct xfs_btree_cur		*cur,
+	int				*logflags,
+	enum shift_direction		direction)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_bmbt_rec_host	*nbrp;	/* neighbour in shift direction */
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		nbr;
+	xfs_fileoff_t			new_off;
+	int				nextents;
+	int				stat;
+	int				error;
+
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+
+	xfs_bmbt_get_all(gotp, &got);
+
+	/* delalloc extents should be prevented by caller */
+	XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
+
+	if (direction == SHIFT_LEFT) {
+		new_off = got.br_startoff - offset_shift_fsb;
+
+		if (*current_ext == 0) {
+			/*
+			 * First extent of the fork: there is nothing to merge
+			 * with, only make sure the shift does not run past
+			 * the start of the file.
+			 */
+			if (got.br_startoff < offset_shift_fsb)
+				return -EINVAL;
+		} else {
+			/* The hole after the left neighbour must be big enough. */
+			nbrp = xfs_iext_get_ext(ifp, *current_ext - 1);
+			xfs_bmbt_get_all(nbrp, &nbr);
+
+			if (new_off < nbr.br_startoff + nbr.br_blockcount)
+				return -EINVAL;
+
+			/*
+			 * Merge instead of shifting when possible. On this
+			 * path *current_ext is left untouched.
+			 */
+			if (xfs_bmse_can_merge(&nbr, &got, offset_shift_fsb))
+				return xfs_bmse_merge(ip, whichfork,
+						      offset_shift_fsb,
+						      *current_ext, gotp, nbrp,
+						      cur, logflags);
+		}
+
+		/* next iteration handles the following extent */
+		(*current_ext)++;
+	} else {
+		new_off = got.br_startoff + offset_shift_fsb;
+
+		/* the last extent has no right neighbour to collide with */
+		if (*current_ext < (nextents - 1)) {
+			/*
+			 * Make sure the gap up to the next extent can take
+			 * the shifted extent.
+			 */
+			nbrp = xfs_iext_get_ext(ifp, *current_ext + 1);
+			xfs_bmbt_get_all(nbrp, &nbr);
+			if (new_off + got.br_blockcount > nbr.br_startoff)
+				return -EINVAL;
+
+			/*
+			 * Unlike a left shift (which involves a hole punch),
+			 * a right shift does not modify extent neighbors
+			 * in any way. We should never find mergeable extents
+			 * in this scenario. Check anyways and warn if we
+			 * encounter two extents that could be one.
+			 */
+			if (xfs_bmse_can_merge(&got, &nbr, offset_shift_fsb))
+				WARN_ON_ONCE(1);
+		}
+
+		/* next iteration handles the preceding extent */
+		(*current_ext)--;
+	}
+
+	/*
+	 * Move the in-core record to its new offset, then mirror the change
+	 * in the btree if the fork has one.
+	 */
+	xfs_bmbt_set_startoff(gotp, new_off);
+	*logflags |= XFS_ILOG_CORE;
+	if (cur == NULL) {
+		*logflags |= XFS_ILOG_DEXT;
+		return 0;
+	}
+
+	/* The btree still holds the record at its old offset. */
+	error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
+				   got.br_blockcount, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(mp, stat == 1);
+
+	return xfs_bmbt_update(cur, new_off, got.br_startblock,
+			       got.br_blockcount, got.br_state);
+}
+
+/*
+ * Shift extent records to the left/right to cover/create a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation is
+ * @num_exts. @stop_fsb specifies the file offset at which to stop shift and the
+ * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
+ * is the length by which each extent is shifted. If there is no hole to shift
+ * the extents into, this will be considered invalid operation and we abort
+ * immediately.
+ */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t *next_fsb, + xfs_fileoff_t offset_shift_fsb, + int *done, + xfs_fileoff_t stop_fsb, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + enum shift_direction direction, + int num_exts) +{ + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; + int error = 0; + int whichfork = XFS_DATA_FORK; + int logflags = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } + + /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. 
+ */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * *next_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + goto del_cursor; + } + + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { + error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, + ¤t_ext, gotp, cur, &logflags, + direction); + if (error) + goto del_cursor; + /* + * If there was an extent merge during the shift, the extent + * count can change. Update the total and grade the next record. 
+ */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } + + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; + break; + } + gotp = xfs_iext_get_ext(ifp, current_ext); + } + + if (!*done) { + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + } + +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + + return error; +} + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if 
there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. + */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. 
+ */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_bmap.h b/kernel/fs/xfs/libxfs/xfs_bmap.h new file mode 100644 index 000000000..6aaa0c1c7 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BMAP_H__
+#define	__XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_ifork;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern kmem_zone_t	*xfs_bmap_free_item_zone;
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+	xfs_fsblock_t		*firstblock;	/* i/o first block allocated */
+	struct xfs_bmap_free	*flist;		/* bmap freelist */
+	struct xfs_trans	*tp;		/* transaction pointer */
+	struct xfs_inode	*ip;		/* incore inode pointer */
+	struct xfs_bmbt_irec	prev;		/* extent before the new one */
+	struct xfs_bmbt_irec	got;		/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;		/* offset in file filling in */
+	xfs_extlen_t		length;		/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;		/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;		/* btree cursor */
+	xfs_extnum_t		idx;		/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	bool			eof;	/* set if allocating past last extent */
+	bool			wasdel;	/* replacing a delayed allocation */
+	bool			userdata;/* set if is user data */
+	bool			aeof;	/* allocated space at eof */
+	bool			conv;	/* overwriting unwritten extents */
+	int			flags;	/* presumably XFS_BMAPI_* - TODO confirm */
+};
+
+/*
+ * List of extents to be freed "later".
+ * The list is kept sorted on xbfi_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
+	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
+	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
+ */
+typedef	struct xfs_bmap_free
+{
+	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-freed extents */
+	int			xbf_count;	/* count of items on list */
+	int			xbf_low;	/* alloc in low mode */
+} xfs_bmap_free_t;
+
+#define	XFS_BMAP_MAX_NMAP	4
+
+/*
+ * Flags for xfs_bmapi_*
+ */
+#define XFS_BMAPI_ENTIRE	0x001	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x002	/* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK	0x004	/* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
+					/* combine contig. space */
+#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
+/*
+ * Unwritten extent conversion - this needs write cache flushing and no
+ * additional allocation alignments. When specified with XFS_BMAPI_PREALLOC it
+ * converts from written to unwritten, otherwise from unwritten to written.
+ */ +#define XFS_BMAPI_CONVERT 0x040 + +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + + +static inline int xfs_bmapi_aflag(int w) +{ + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); +} + +/* + * Special values for xfs_bmbt_irec_t br_startblock field. + */ +#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) +#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) + +static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) +{ + ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ + (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); +} + +/* + * Flags for xfs_bmap_add_extent*. + */ +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) + +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } + + +/* + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. 
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
+
+/* Direction argument for xfs_bmap_shift_extents(). */
+enum shift_direction {
+	SHIFT_LEFT = 0,		/* extents move to lower file offsets */
+	SHIFT_RIGHT,		/* extents move to higher file offsets */
+};
+
+#ifdef DEBUG
+void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
+		int whichfork, unsigned long caller_ip);
+#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)	\
+	xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_)
+#else
+#define	XFS_BMAP_TRACE_EXLIST(ip,c,w)
+#endif
+
+int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
+void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
+		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+			int *committed);
+void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
+int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
+int	xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *last_block, int whichfork);
+int	xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+		int whichfork);
+int	xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
+int	xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int whichfork);
+int	xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno,
+		xfs_filblks_t len, struct xfs_bmbt_irec *mval,
+		int *nmap, int flags);
+int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_fsblock_t *firstblock, xfs_extlen_t total,
+		struct xfs_bmbt_irec *mval, int *nmap,
+		struct xfs_bmap_free *flist);
+int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, int *done);
+int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
+		xfs_extnum_t num);
+uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
+		int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
+		struct xfs_bmap_free *flist, enum shift_direction direction,
+		int num_exts);
+int	xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+
+#endif	/* __XFS_BMAP_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.c b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
new file mode 100644
index 000000000..2c44c8e50
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" + +/* + * Determine the extent state. + */ +/* ARGSUSED */ +STATIC xfs_exntst_t +xfs_extent_state( + xfs_filblks_t blks, + int extent_flag) +{ + if (extent_flag) { + ASSERT(blks != 0); /* saved for DMIG */ + return XFS_EXT_UNWRITTEN; + } + return XFS_EXT_NORM; +} + +/* + * Convert on-disk form of btree root to in-memory form. 
+ */
+void
+xfs_bmdr_to_bmbt(
+	struct xfs_inode	*ip,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen,
+	struct xfs_btree_block	*rblock,
+	int			rblocklen)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_bmbt_key_t		*src_keys;
+	xfs_bmbt_key_t		*dst_keys;
+	__be64			*src_ptrs;
+	__be64			*dst_ptrs;
+	int			maxrecs;	/* capacity of the on-disk root */
+	int			numrecs;	/* records actually in use */
+
+	/* Initialise the in-memory header; the magic depends on CRC support. */
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS);
+	else
+		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
+				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
+
+	/* Level and record count are copied without byte swapping. */
+	rblock->bb_level = dblock->bb_level;
+	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
+	rblock->bb_numrecs = dblock->bb_numrecs;
+
+	/*
+	 * The on-disk pointer array is located using the maximum record
+	 * count for @dblocklen, while only the records in use are copied.
+	 */
+	maxrecs = xfs_bmdr_maxrecs(dblocklen, 0);
+	numrecs = be16_to_cpu(dblock->bb_numrecs);
+
+	src_keys = XFS_BMDR_KEY_ADDR(dblock, 1);
+	dst_keys = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+	src_ptrs = XFS_BMDR_PTR_ADDR(dblock, 1, maxrecs);
+	dst_ptrs = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+
+	memcpy(dst_keys, src_keys, sizeof(*src_keys) * numrecs);
+	memcpy(dst_ptrs, src_ptrs, sizeof(*src_ptrs) * numrecs);
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */ +STATIC void +__xfs_bmbt_get_all( + __uint64_t l0, + __uint64_t l1, + xfs_bmbt_irec_t *s) +{ + int ext_flag; + xfs_exntst_t st; + + ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); + s->br_startoff = ((xfs_fileoff_t)l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)l1) >> 21); + s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); + /* This is xfs_extent_state() in-line */ + if (ext_flag) { + ASSERT(s->br_blockcount != 0); /* saved for DMIG */ + st = XFS_EXT_UNWRITTEN; + } else + st = XFS_EXT_NORM; + s->br_state = st; +} + +void +xfs_bmbt_get_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(r->l0, r->l1, s); +} + +/* + * Extract the blockcount field from an in memory bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_get_blockcount( + xfs_bmbt_rec_host_t *r) +{ + return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); +} + +/* + * Extract the startblock field from an in memory bmap extent record. + */ +xfs_fsblock_t +xfs_bmbt_get_startblock( + xfs_bmbt_rec_host_t *r) +{ + return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | + (((xfs_fsblock_t)r->l1) >> 21); +} + +/* + * Extract the startoff field from an in memory bmap extent record. + */ +xfs_fileoff_t +xfs_bmbt_get_startoff( + xfs_bmbt_rec_host_t *r) +{ + return ((xfs_fileoff_t)r->l0 & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + +xfs_exntst_t +xfs_bmbt_get_state( + xfs_bmbt_rec_host_t *r) +{ + int ext_flag; + + ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN)); + return xfs_extent_state(xfs_bmbt_get_blockcount(r), + ext_flag); +} + +/* + * Extract the blockcount field from an on disk bmap extent record. + */ +xfs_filblks_t +xfs_bmbt_disk_get_blockcount( + xfs_bmbt_rec_t *r) +{ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); +} + +/* + * Extract the startoff field from a disk format bmap extent record. 
+ */ +xfs_fileoff_t +xfs_bmbt_disk_get_startoff( + xfs_bmbt_rec_t *r) +{ + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; +} + + +/* + * Set all the fields in a bmap extent record from the arguments. + */ +void +xfs_bmbt_set_allf( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +void +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + + +/* + * Set all the fields in a disk format bmap extent record from the arguments. + */ +void +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int extent_flag = (state == XFS_EXT_NORM) ? 
0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); +} + +/* + * Set all the fields in a bmap extent record from the uncompressed form. + */ +STATIC void +xfs_bmbt_disk_set_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); +} + +/* + * Set the blockcount field in a bmap extent record. + */ +void +xfs_bmbt_set_blockcount( + xfs_bmbt_rec_host_t *r, + xfs_filblks_t v) +{ + ASSERT((v & xfs_mask64hi(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | + (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); +} + +/* + * Set the startblock field in a bmap extent record. + */ +void +xfs_bmbt_set_startblock( + xfs_bmbt_rec_host_t *r, + xfs_fsblock_t v) +{ + ASSERT((v & xfs_mask64hi(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | + (xfs_bmbt_rec_base_t)(v >> 43); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | + (xfs_bmbt_rec_base_t)(v << 21); +} + +/* + * Set the startoff field in a bmap extent record. + */ +void +xfs_bmbt_set_startoff( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t v) +{ + ASSERT((v & xfs_mask64hi(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | + ((xfs_bmbt_rec_base_t)v << 9) | + (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); +} + +/* + * Set the extent state field in a bmap extent record. 
+ */ +void +xfs_bmbt_set_state( + xfs_bmbt_rec_host_t *r, + xfs_exntst_t v) +{ + ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); + if (v == XFS_EXT_NORM) + r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); + else + r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); +} + +/* + * Convert in-memory form of btree root to on-disk form. + */ +void +xfs_bmbt_to_bmdr( + struct xfs_mount *mp, + struct xfs_btree_block *rblock, + int rblocklen, + xfs_bmdr_block_t *dblock, + int dblocklen) +{ + int dmxr; + xfs_bmbt_key_t *fkp; + __be64 *fpp; + xfs_bmbt_key_t *tkp; + __be64 *tpp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)); + ASSERT(rblock->bb_level != 0); + dblock->bb_level = rblock->bb_level; + dblock->bb_numrecs = rblock->bb_numrecs; + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + dmxr = be16_to_cpu(dblock->bb_numrecs); + memcpy(tkp, fkp, sizeof(*fkp) * dmxr); + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); +} + +/* + * Check extent records, which have just been read, for + * any bit in the extent flag field. ASSERT on debug + * kernels, as this condition should not occur. + * Return an error condition (1) if any flags found, + * otherwise return 0. 
/*
 * Check extent records, which have just been read, for
 * any bit in the extent flag field.  ASSERT on debug
 * kernels, as this condition should not occur.
 * Return an error condition (1) if any flags found,
 * otherwise return 0.
 *
 * ifp: inode fork holding the in-core extent records
 * idx: index of the first record to check
 * num: number of consecutive records to check
 */

int
xfs_check_nostate_extents(
	xfs_ifork_t		*ifp,
	xfs_extnum_t		idx,
	xfs_extnum_t		num)
{
	for (; num > 0; num--, idx++) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
		/* Anything left after shifting out the low bits is a flag bit. */
		if ((ep->l0 >>
		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
			ASSERT(0);
			return 1;
		}
	}
	return 0;
}


/*
 * Create a new bmap btree cursor for the same mount, transaction, inode
 * and fork as @cur, carrying over the bmap-private allocation state.
 */
STATIC struct xfs_btree_cur *
xfs_bmbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	struct xfs_btree_cur	*new;

	new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_private.b.ip, cur->bc_private.b.whichfork);

	/*
	 * Copy the firstblock, flist, and flags values,
	 * since init cursor doesn't get them.
	 */
	new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
	new->bc_private.b.flist = cur->bc_private.b.flist;
	new->bc_private.b.flags = cur->bc_private.b.flags;

	return new;
}

/*
 * Fold the allocation state of @src into @dst: the allocated-block count
 * is added to @dst, firstblock is copied over, and @src's count is reset
 * to zero.
 */
STATIC void
xfs_bmbt_update_cursor(
	struct xfs_btree_cur	*src,
	struct xfs_btree_cur	*dst)
{
	ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
	       (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
	ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);

	dst->bc_private.b.allocated += src->bc_private.b.allocated;
	dst->bc_private.b.firstblock = src->bc_private.b.firstblock;

	src->bc_private.b.allocated = 0;
}

/*
 * Allocate one filesystem block for the bmap btree.
 *
 * cur:   btree cursor; supplies the transaction, mount, inode and the
 *        firstblock/flist allocation state
 * start: block pointer used as the allocation target when the cursor has
 *        no firstblock yet
 * new:   set to the allocated block (disk-endian) when *stat is 1
 * stat:  set to 1 if a block was allocated, 0 if none could be found
 *
 * Returns 0 on success (including the "no block found" case) or a
 * negative errno; -ENOSPC is returned up front when the extent was not a
 * delayed allocation and the transaction has no block reservation left.
 */
STATIC int
xfs_bmbt_alloc_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*start,
	union xfs_btree_ptr	*new,
	int			*stat)
{
	xfs_alloc_arg_t		args;		/* block allocation args */
	int			error;		/* error return value */

	memset(&args, 0, sizeof(args));
	args.tp = cur->bc_tp;
	args.mp = cur->bc_mp;
	args.fsbno = cur->bc_private.b.firstblock;
	args.firstblock = args.fsbno;

	if (args.fsbno == NULLFSBLOCK) {
		/* No allocation done with this cursor yet: aim at @start. */
		args.fsbno = be64_to_cpu(start->l);
		args.type = XFS_ALLOCTYPE_START_BNO;
		/*
		 * Make sure there is sufficient room left in the AG to
		 * complete a full tree split for an extent insert.  If
		 * we are converting the middle part of an extent then
		 * we may need space for two tree splits.
		 *
		 * We are relying on the caller to make the correct block
		 * reservation for this operation to succeed.  If the
		 * reservation amount is insufficient then we may fail a
		 * block allocation here and corrupt the filesystem.
		 */
		args.minleft = xfs_trans_get_block_res(args.tp);
	} else if (cur->bc_private.b.flist->xbf_low) {
		args.type = XFS_ALLOCTYPE_START_BNO;
	} else {
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
	}

	/* Exactly one block is requested. */
	args.minlen = args.maxlen = args.prod = 1;
	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
		error = -ENOSPC;
		goto error0;
	}
	error = xfs_alloc_vextent(&args);
	if (error)
		goto error0;

	if (args.fsbno == NULLFSBLOCK && args.minleft) {
		/*
		 * Could not find an AG with enough free space to satisfy
		 * a full btree split.  Try again without minleft and if
		 * successful activate the lowspace algorithm.
		 */
		args.fsbno = 0;
		args.type = XFS_ALLOCTYPE_FIRST_AG;
		args.minleft = 0;
		error = xfs_alloc_vextent(&args);
		if (error)
			goto error0;
		/* Set once the retry call returns without error. */
		cur->bc_private.b.flist->xbf_low = 1;
	}
	if (args.fsbno == NULLFSBLOCK) {
		/* Nothing available: report "not allocated" without an error. */
		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		*stat = 0;
		return 0;
	}
	ASSERT(args.len == 1);
	/* Record the allocation in the cursor and charge it to the inode. */
	cur->bc_private.b.firstblock = args.fsbno;
	cur->bc_private.b.allocated++;
	cur->bc_private.b.ip->i_d.di_nblocks++;
	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
	xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
			XFS_TRANS_DQ_BCOUNT, 1L);

	new->l = cpu_to_be64(args.fsbno);

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

 error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}

/*
 * Release the btree block held in @bp: queue it on the cursor's free
 * list, drop it from the inode's block count and quota, log the inode
 * core and invalidate the buffer in the transaction.  Always returns 0.
 */
STATIC int
xfs_bmbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_inode	*ip = cur->bc_private.b.ip;
	struct xfs_trans	*tp = cur->bc_tp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));

	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
	ip->i_d.di_nblocks--;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
	xfs_trans_binval(tp, bp);
	return 0;
}

/*
 * Minimum number of records for a block at @level.  For the root level
 * (the in-inode root) this is half of what fits in the current in-memory
 * root size; for other levels it comes from the per-mount table, indexed
 * by leaf (0) vs. node (1).
 */
STATIC int
xfs_bmbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp;

		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
				    cur->bc_private.b.whichfork);

		return xfs_bmbt_maxrecs(cur->bc_mp,
					ifp->if_broot_bytes, level == 0) / 2;
	}

	return cur->bc_mp->m_bmap_dmnr[level != 0];
}

/*
 * Maximum number of records for a block at @level.  For the root level
 * this is what fits in the current in-memory root size; for other levels
 * it comes from the per-mount table, indexed by leaf (0) vs. node (1).
 */
int
xfs_bmbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	if (level == cur->bc_nlevels - 1) {
		struct xfs_ifork	*ifp;

		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
				    cur->bc_private.b.whichfork);

		return xfs_bmbt_maxrecs(cur->bc_mp,
					ifp->if_broot_bytes, level == 0);
	}

	return cur->bc_mp->m_bmap_dmxr[level != 0];

}
+ */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +static bool +xfs_bmbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. 
+ * + * We don't know what fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; +} + +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_bmbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_lblock_calc_crc(bp); +} + +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* 
DEBUG */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + .buf_ops = &xfs_bmbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree 
block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. 
+ */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return -ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_bmap_btree.h b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h new file mode 100644 index 000000000..819a8a4de --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_bmap_btree.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_BTREE_H__ +#define __XFS_BMAP_BTREE_H__ + +struct xfs_btree_cur; +struct xfs_btree_block; +struct xfs_mount; +struct xfs_inode; +struct xfs_trans; + +/* + * Extent state and extent format macros. + */ +#define XFS_EXTFMT_INODE(x) \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? 
/*
 * Extent state and extent format macros.
 *
 * XFS_EXTFMT_INODE(x) picks the extent format for inode x based on the
 * superblock's extent-flag-bit feature; ISUNWRITTEN(x) tests an unpacked
 * extent record for the unwritten state.
 */
#define XFS_EXTFMT_INODE(x)	\
	(xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
#define ISUNWRITTEN(x)	((x)->br_state == XFS_EXT_UNWRITTEN)

/*
 * Btree block header size depends on a superblock flag.
 * (CRC-enabled filesystems use the longer header.)
 */
#define XFS_BMBT_BLOCK_LEN(mp) \
	(xfs_sb_version_hascrc(&((mp)->m_sb)) ? \
		XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN)

/*
 * Addressing within a bmap btree block.  All indices are 1-based.
 * Records and keys start right after the block header; pointers start
 * after room for maxrecs keys.
 */
#define XFS_BMBT_REC_ADDR(mp, block, index) \
	((xfs_bmbt_rec_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))

#define XFS_BMBT_KEY_ADDR(mp, block, index) \
	((xfs_bmbt_key_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 ((index) - 1) * sizeof(xfs_bmbt_key_t)))

#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
	((xfs_bmbt_ptr_t *) \
		((char *)(block) + \
		 XFS_BMBT_BLOCK_LEN(mp) + \
		 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
		 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))

/*
 * Addressing within an on-disk inode root (bmdr) block.  Same scheme as
 * above, but the header is the fixed-size struct xfs_bmdr_block.
 */
#define XFS_BMDR_REC_ADDR(block, index) \
	((xfs_bmdr_rec_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
	         ((index) - 1) * sizeof(xfs_bmdr_rec_t)))

#define XFS_BMDR_KEY_ADDR(block, index) \
	((xfs_bmdr_key_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 ((index) - 1) * sizeof(xfs_bmdr_key_t)))

#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
	((xfs_bmdr_ptr_t *) \
		((char *)(block) + \
		 sizeof(struct xfs_bmdr_block) + \
		 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
		 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
/*
 * These are to be used when we know the size of the block and
 * we don't have a cursor.
 *
 * XFS_BMAP_BROOT_PTR_ADDR derives the pointer array position of an
 * in-memory root of sz bytes from the node capacity of a block that size.
 */
#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))

/*
 * Bytes needed for an in-memory root holding nrecs key/pointer pairs,
 * and the same computed from a block's own bb_numrecs.
 */
#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \
	(int)(XFS_BMBT_BLOCK_LEN(mp) + \
	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))

#define XFS_BMAP_BROOT_SPACE(mp, bb) \
	(XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs)))
/*
 * Bytes needed for an on-disk inode root holding nrecs key/pointer pairs,
 * and the same computed from a block's own bb_numrecs.
 * NOTE(review): sized with the xfs_bmbt_* key/ptr types rather than the
 * xfs_bmdr_* ones used by the XFS_BMDR_*_ADDR macros; presumably these
 * types have identical sizes -- confirm against xfs_format.h.
 */
#define XFS_BMDR_SPACE_CALC(nrecs) \
	(int)(sizeof(xfs_bmdr_block_t) + \
	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
#define XFS_BMAP_BMDR_SPACE(bb) \
	(XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs)))

/*
 * Maximum number of bmap btree levels.
 * (w indexes the per-mount m_bm_maxlevels array.)
 */
#define XFS_BM_MAXLEVELS(mp,w)		((mp)->m_bm_maxlevels[(w)])

/*
 * Prototypes for xfs_bmap.c to call.
 */

/* Conversion between the on-disk inode root and in-memory root forms. */
extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int,
			struct xfs_btree_block *, int);

/* Field getters for in-core (host-endian) extent records. */
extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);

/* Field getters for disk format extent records. */
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);

/* Field setters for in-core (host-endian) extent records. */
extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v);
extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v);
extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v);
extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v);

/* Setter for disk format extent records. */
extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
			xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);

extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
			xfs_bmdr_block_t *, int);

/* Block capacity helpers. */
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);

extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
				 int whichfork, xfs_ino_t new_owner,
				 struct list_head *buffer_list);

extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
		struct xfs_trans *, struct xfs_inode *, int);

#endif	/* __XFS_BMAP_BTREE_H__ */
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_alloc.h" + +/* + * Cursor allocation zone. + */ +kmem_zone_t *xfs_btree_cur_zone; + +/* + * Btree magic numbers. + */ +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } +}; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] + + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer for block, if any */ +{ + int lblock_ok = 1; /* block passes checks */ + struct xfs_mount *mp; /* file system mount point */ + + mp = cur->bc_mp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); + } + + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block */ +{ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok = 1; /* block passes checks */ + + mp = cur->bc_mp; + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = be32_to_cpu(agf->agf_length); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? 
bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Debug routine: check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block, if any */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); + else + return xfs_btree_check_sblock(cur, block, level, bp); +} + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_fsblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + level > 0 && + bno != NULLFSBLOCK && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); + return 0; +} + +#ifdef DEBUG +/* + * Check that (short) pointer is ok. 
+ */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ +{ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; + + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, + level > 0 && + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); + return 0; +} + +/* + * Check that block ptr is ok. + */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); + } +} +#endif + +/* + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + + return true; +} + +/* + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. 
+ * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. + */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + + return true; +} + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error) /* del because of error */ +{ + int i; /* btree level */ + + /* + * Clear the buffer pointers, and release the buffers. + * If we're doing this in the face of an error, we + * need to make sure to inspect all of the entries + * in the bc_bufs array for buffers to be unlocked. + * This is because some of the btree code works from + * level n down to 0, and if we get an error along + * the way we won't have initialized all the entries + * down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + else if (!error) + break; + } + /* + * Can't free a bmap cursor without having dealt with the + * allocated indirect blocks' accounting. + */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || + cur->bc_private.b.allocated == 0); + /* + * Free the cursor. + */ + kmem_zone_free(xfs_btree_cur_zone, cur); +} + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. 
+ */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur) /* output cursor */ +{ + xfs_buf_t *bp; /* btree block's buffer pointer */ + int error; /* error return value */ + int i; /* level number of btree block */ + xfs_mount_t *mp; /* mount structure for filesystem */ + xfs_btree_cur_t *new; /* new cursor value */ + xfs_trans_t *tp; /* transaction pointer, can be NULL */ + + tp = cur->bc_tp; + mp = cur->bc_mp; + + /* + * Allocate a new cursor like the old one. + */ + new = cur->bc_ops->dup_cursor(cur); + + /* + * Copy the record currently in the cursor. + */ + new->bc_rec = cur->bc_rec; + + /* + * For each level current, re-get the buffer and copy the ptr value. + */ + for (i = 0; i < new->bc_nlevels; i++) { + new->bc_ptrs[i] = cur->bc_ptrs[i]; + new->bc_ra[i] = cur->bc_ra[i]; + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, + cur->bc_ops->buf_ops); + if (error) { + xfs_btree_del_cursor(new, error); + *ncur = NULL; + return error; + } + } + new->bc_bufs[i] = bp; + } + *ncur = new; + return 0; +} + +/* + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. 
+ * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! + */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; +} + +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. 
+ */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. 
+ */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; +} + +/* + * Retrieve the block pointer from the cursor at the given level. + * This may be an inode btree root or from a buffer. + */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ +xfs_btree_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); + } + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); +} + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +xfs_buf_t * /* buffer for fsbno */ +xfs_btree_get_bufl( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(fsbno != NULLFSBLOCK); + d = XFS_FSB_TO_DADDR(mp, fsbno); + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); +} + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. 
+ */ +xfs_buf_t * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock) /* lock flags for get_buf */ +{ + xfs_daddr_t d; /* real disk block address */ + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); +} + +/* + * Check for the cursor referring to the last block at the given level. + */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); + else + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); +} + +/* + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_firstrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; +} + +/* + * Change the cursor to point to the last record in the current block + * at the given level. 
Other levels are unaffected. + */ +STATIC int /* success=1, failure=0 */ +xfs_btree_lastrec( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to change */ +{ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ + + /* + * Get the block pointer for this level. + */ + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to numrecs, that's the last record/key. + */ + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + return 1; +} + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets, /* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last) /* output: last byte offset */ +{ + int i; /* current bit number */ + __int64_t imask; /* mask for current bit number */ + + ASSERT(fields != 0); + /* + * Find the lowest bit, so the first byte offset. + */ + for (i = 0, imask = 1LL; ; i++, imask <<= 1) { + if (imask & fields) { + *first = offsets[i]; + break; + } + } + /* + * Find the highest bit, so the last byte offset. + */ + for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) { + if (imask & fields) { + *last = offsets[i + 1] - 1; + break; + } + } +} + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. 
+ */
+int
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	uint			lock,		/* lock flags for read_buf */
+	struct xfs_buf		**bpp,		/* buffer for fsbno */
+	int			refval,		/* ref count value for buffer */
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*buf;		/* buffer handed back on success */
+	xfs_daddr_t		daddr;		/* disk address of fsbno */
+	int			error;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+
+	daddr = XFS_FSB_TO_DADDR(mp, fsbno);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, daddr,
+				   mp->m_bsize, lock, &buf, ops);
+	if (!error) {
+		/* *bpp is only written on success; it may receive NULL. */
+		if (buf)
+			xfs_buf_set_ref(buf, refval);
+		*bpp = buf;
+	}
+	return error;
+}
+
+/*
+ * Issue readahead for count blocks starting at the given filesystem
+ * block; nothing is waited for and no buffer is returned.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	const struct xfs_buf_ops *ops)
+{
+	xfs_daddr_t		daddr;		/* disk address of fsbno */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+
+	daddr = XFS_FSB_TO_DADDR(mp, fsbno);
+	xfs_buf_readahead(mp->m_ddev_targp, daddr, mp->m_bsize * count, ops);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */ +/* ARGSUSED */ +void +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) +{ + xfs_daddr_t d; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agbno != NULLAGBLOCK); + d = XFS_AGB_TO_DADDR(mp, agno, agbno); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); +} + +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1, cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1, cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +/* + * Read-ahead btree blocks, at the given level. + * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 
+ */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ + int lev, /* level in btree */ + int lr) /* left/right bits */ +{ + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. + */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; + + cur->bc_ra[lev] |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); +} + +/* + * Set the buffer for level "lev" in the cursor to bp, releasing + * any previous buffer. 
+ */ +STATIC void +xfs_btree_setbuf( + xfs_btree_cur_t *cur, /* btree cursor */ + int lev, /* level in btree */ + xfs_buf_t *bp) /* new buffer to set */ +{ + struct xfs_btree_block *b; /* btree block */ + + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); + cur->bc_bufs[lev] = bp; + cur->bc_ra[lev] = 0; + + b = XFS_BUF_TO_BLOCK(bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } else { + if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + } +} + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return ptr->l == cpu_to_be64(NULLFSBLOCK); + else + return ptr->s == cpu_to_be32(NULLAGBLOCK); +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLFSBLOCK); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == 
XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + buf->bb_u.l.bb_lsn = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.s.bb_lsn = 0; + } + } +} + +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); +} + +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + int numrecs) +{ + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. 
This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updates to this record. The decision + * will be further refined in the update_lastrec method. + */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = 
xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + if (!*bpp) + return -ENOMEM; + + (*bpp)->b_ops = cur->bc_ops->buf_ops; + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp, + cur->bc_ops->buf_ops); + if (error) + return error; + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. + */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. 
+ */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. 
+ */
+void
+xfs_btree_log_recs(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			first,	/* index of first record to log */
+	int			last)	/* index of last record to log */
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);	/* bp is used without a NULL check here */
+	xfs_trans_log_buf(cur->bc_tp, bp,
+			  xfs_btree_rec_offset(cur, first),	/* first byte of record "first" */
+			  xfs_btree_rec_offset(cur, last + 1) - 1);	/* last byte of record "last" */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ *
+ * With a buffer, the byte range covering pointers first..last is logged;
+ * the pointer offsets depend on the level stored in the block itself.
+ * Without a buffer (bp == NULL) the inode is logged instead, with the
+ * xfs_ilog_fbroot() flags for the cursor's fork.
+ */
+STATIC void
+xfs_btree_log_ptrs(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			first,	/* index of first pointer to log */
+	int			last)	/* index of last pointer to log */
+{
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+	if (bp) {
+		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+		int			level = xfs_btree_get_level(block);
+
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp,
+				xfs_btree_ptr_offset(cur, first, level),
+				xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ *
+ * The two tables below list the starting offset of every header field,
+ * in bit order of the fields mask, for short and long pointer blocks.
+ * Each table ends with the full CRC header length so that the end of the
+ * last field can be derived from the entry that follows it.  The tables
+ * are handed to xfs_btree_offsets() to turn the mask into a byte range.
+ *
+ * Without a buffer (bp == NULL) the inode is logged instead, with the
+ * xfs_ilog_fbroot() flags for the cursor's fork.
+ */
+void
+xfs_btree_log_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_buf		*bp,	/* buffer containing btree block */
+	int			fields)	/* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	soffsets[] = {	/* table of offsets (short) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
+		XFS_BTREE_SBLOCK_CRC_LEN	/* end marker: total header length */
+	};
+	static const short	loffsets[] = {	/* table of offsets (long) */
+		offsetof(struct xfs_btree_block, bb_magic),
+		offsetof(struct xfs_btree_block, bb_level),
+		offsetof(struct xfs_btree_block, bb_numrecs),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
+		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
+		XFS_BTREE_LBLOCK_CRC_LEN	/* end marker: total header length */
+	};
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+	if (bp) {
+		int nbits;	/* number of mask bits valid for this block format */
+
+		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+			/*
+			 * We don't log the CRC when updating a btree
+			 * block but instead recreate it during log
+			 * recovery.  As the log buffers have checksums
+			 * of their own this is safe and avoids logging a crc
+			 * update in a lot of places.
+			 */
+			if (fields == XFS_BB_ALL_BITS)
+				fields = XFS_BB_ALL_BITS_CRC;
+			nbits = XFS_BB_NUM_BITS_CRC;
+		} else {
+			nbits = XFS_BB_NUM_BITS;
+		}
+		xfs_btree_offsets(fields,
+				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+					loffsets : soffsets,
+				  nbits, &first, &last);
+		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
+		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+	} else {
+		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+			xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ *
+ * On a zero return, *stat is 1 if the cursor now points at the next
+ * record, and 0 if there was nothing to the right (the block has no right
+ * sibling, or the walk ran off the top of a btree rooted in an inode).
+ * On a nonzero (error) return *stat is not written.
+ *
+ * Note that bc_ptrs[level] is incremented before the edge checks, so it
+ * stays one past the last record when *stat is 0.
+ */
+int						/* error */
+xfs_btree_increment(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	union xfs_btree_ptr	ptr;
+	struct xfs_buf		*bp;
+	int			error;		/* error return value */
+	int			lev;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the right at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* We're done if we remain in the block after the increment. */
+	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+		goto out1;
+
+	/* Fail if we just went off the right edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, increment);
+
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+		error = xfs_btree_check_block(cur, block, lev, bp);
+		if (error)
+			goto error0;
+#endif
+
+		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+			break;
+
+		/* Read-ahead the right block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+
+	/*
+	 * If we went off the root then we are either seriously
+	 * confused or have the tree root in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 *
+	 * Each pass takes the child pointer from the block at lev, steps
+	 * lev down by one, reads that child and makes it the cursor's block
+	 * for the new lev, positioned at its first entry.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;	/* ptrp was taken from the level above the new lev */
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = 1;
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ *
+ * On a zero return, *stat is 1 if the cursor now points at the previous
+ * record, and 0 if there was nothing to the left (the block has no left
+ * sibling, or the walk ran off the top of a btree rooted in an inode).
+ * On a nonzero (error) return *stat is not written.
+ *
+ * Note that bc_ptrs[level] is decremented before the edge checks, so it
+ * stays at zero when *stat is 0.
+ */
+int						/* error */
+xfs_btree_decrement(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	struct xfs_btree_block	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+	int			lev;
+	union xfs_btree_ptr	ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, level);
+
+	ASSERT(level < cur->bc_nlevels);
+
+	/* Read-ahead to the left at this level. */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+	/* We're done if we remain in the block after the decrement. */
+	if (--cur->bc_ptrs[level] > 0)
+		goto out1;
+
+	/* Get a pointer to the btree block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/* Fail if we just went off the left edge of the tree. */
+	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+	if (xfs_btree_ptr_is_null(cur, &ptr))
+		goto out0;
+
+	XFS_BTREE_STATS_INC(cur, decrement);
+
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/* Read-ahead the left block for the next loop. */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+
+	/*
+	 * If we went off the root then we are either seriously confused
+	 * or the root of the tree is in an inode.
+	 */
+	if (lev == cur->bc_nlevels) {
+		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+			goto out0;
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+		goto error0;
+	}
+	ASSERT(lev < cur->bc_nlevels);
+
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 *
+	 * Each pass takes the child pointer from the block at lev, steps
+	 * lev down by one, reads that child and makes it the cursor's block
+	 * for the new lev, positioned at its last entry.
+	 */
+	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+		union xfs_btree_ptr	*ptrp;
+
+		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+		--lev;	/* ptrp was taken from the level above the new lev */
+		error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
+		if (error)
+			goto error0;
+		xfs_btree_setbuf(cur, lev, bp);
+		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+	}
+out1:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int	/* error; 0 with *blkp set on success */
+xfs_btree_lookup_get_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in the btree */
+	union xfs_btree_ptr	*pp,	/* ptr to btree block */
+	struct xfs_btree_block	**blkp) /* return btree block */
+{
+	struct xfs_buf		*bp;	/* buffer pointer for btree block */
+	int			error = 0;
+
+	/* special case the root block if in an inode */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*blkp = xfs_btree_get_iroot(cur);	/* pp is not consulted in this case */
+		return 0;
+	}
+
+	/*
+	 * If the old buffer at this level is for the disk address we are
+	 * looking for, re-use it.
+	 *
+	 * Otherwise throw it away and get a new one.
+	 */
+	bp = cur->bc_bufs[level];
+	if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+		*blkp = XFS_BUF_TO_BLOCK(bp);
+		return 0;
+	}
+
+	error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
+	if (error)
+		return error;
+
+	xfs_btree_setbuf(cur, level, bp);	/* releases the buffer previously held at this level, if any */
+	return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ *
+ * kp is caller-provided scratch space: at level 0 the key is built in it
+ * and kp is returned; at other levels kp is left untouched and a pointer
+ * into the block is returned.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			keyno,
+	struct xfs_btree_block	*block,
+	union xfs_btree_key	*kp)
+{
+	if (level == 0) {
+		cur->bc_ops->init_key_from_rec(kp,
+				xfs_btree_rec_addr(cur, keyno, block));
+		return kp;
+	}
+
+	return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * stat is set to 0 if can't find any such record, 1 for success.
+ *
+ * On a zero return the leaf position is left in bc_ptrs[0]; when *stat
+ * is 0 that position may be 0 or one past the last record of the leaf.
+ * On a nonzero (error) return via error0, *stat is not written.
+ */
+int					/* error */
+xfs_btree_lookup(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	__int64_t		diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno;	/* current key number */
+	int			level;	/* level in the btree */
+	union xfs_btree_ptr	*pp;	/* ptr to btree block */
+	union xfs_btree_ptr	ptr;	/* ptr to btree block */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGI(cur, dir);
+
+	XFS_BTREE_STATS_INC(cur, lookup);
+
+	block = NULL;
+	keyno = 0;
+
+	/* initialise start pointer from cursor */
+	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	pp = &ptr;
+
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 *
+	 * diff starts out nonzero so that the root block is searched.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		/* Get the block we need to do the lookup on. */
+		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+		if (error)
+			goto error0;
+
+		if (diff == 0) {
+			/*
+			 * If we already had a key match at a higher level, we
+			 * know we need to use the first entry in this block.
+			 */
+			keyno = 1;
+		} else {
+			/* Otherwise search this block. Do a binary search. */
+
+			int	high;	/* high entry number */
+			int	low;	/* low entry number */
+
+			/* Set low and high entry numbers, 1-based. */
+			low = 1;
+			high = xfs_btree_get_numrecs(block);
+			if (!high) {
+				/* Block is empty, must be an empty leaf. */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;	/* 0 for LE, 1 otherwise */
+				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+				*stat = 0;
+				return 0;
+			}
+
+			/* Binary search the block. */
+			while (low <= high) {
+				union xfs_btree_key	key;
+				union xfs_btree_key	*kp;
+
+				XFS_BTREE_STATS_INC(cur, compare);
+
+				/* keyno is average of low and high. */
+				keyno = (low + high) >> 1;
+
+				/* Get current search key */
+				kp = xfs_lookup_get_search_key(cur, level,
+						keyno, block, &key);
+
+				/*
+				 * Compute difference to get next direction:
+				 *  - less than, move right
+				 *  - greater than, move left
+				 *  - equal, we're done
+				 */
+				diff = cur->bc_ops->key_diff(cur, kp);
+				if (diff < 0)
+					low = keyno + 1;
+				else if (diff > 0)
+					high = keyno - 1;
+				else
+					break;
+			}
+		}
+
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+			error = xfs_btree_check_ptr(cur, pp, 0, level);
+			if (error)
+				goto error0;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+
+	/* Done with the search. See if we need to adjust the results. */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > xfs_btree_get_numrecs(block) &&
+		    !xfs_btree_ptr_is_null(cur, &ptr)) {
+			int	i;	/* stat from the increment below */
+
+			cur->bc_ptrs[0] = keyno;
+			error = xfs_btree_increment(cur, 0, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+			*stat = 1;
+			return 0;
+		}
+	} else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+
+	/* Return if we succeeded or not. */
+	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+		*stat = 0;
+	else if (dir != XFS_LOOKUP_EQ || diff == 0)
+		*stat = 1;
+	else
+		*stat = 0;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*keyp,
+	int			level)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_key	*kp;
+	int			ptr;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+ */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. 
*/ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". 
*/ + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Account for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. 
*/ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. 
+ */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. 
+ */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. */ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. 
+ */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +__xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). 
*/ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. 
*/ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. 
+ */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. 
+ */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. 
+ */
+int						/* 0, or the error of a failed helper */
+xfs_btree_new_iroot(
+	struct xfs_btree_cur	*cur,		/* btree cursor; must have
+						 * XFS_BTREE_ROOT_IN_INODE set */
+	int			*logflags,	/* logging flags for inode;
+						 * bits are OR'ed in on success */
+	int			*stat)		/* return status - 0 fail (no
+						 * block allocated), 1 done */
+{
+	struct xfs_buf		*cbp;		/* buffer for cblock */
+	struct xfs_btree_block	*block;		/* btree block: the root returned
+						 * by xfs_btree_get_iroot() */
+	struct xfs_btree_block	*cblock;	/* child btree block (new) */
+	union xfs_btree_key	*ckp;		/* child key pointer */
+	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
+	union xfs_btree_key	*kp;		/* pointer to btree key (root) */
+	union xfs_btree_ptr	*pp;		/* pointer to block addr (root) */
+	union xfs_btree_ptr	nptr;		/* new block addr */
+	int			level;		/* btree level of the root on entry */
+	int			error;		/* error return code */
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_STATS_INC(cur, newroot);
+
+	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+	level = cur->bc_nlevels - 1;	/* topmost level before the tree grows */
+
+	block = xfs_btree_get_iroot(cur);
+	pp = xfs_btree_ptr_addr(cur, 1, block);	/* first pointer in the root */
+
+	/*
+	 * Allocate the new block, passing the root's first pointer and
+	 * getting the new block's address back in nptr.
+	 * If we can't do it, we're toast. Give up.
+	 */
+	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0) {	/* nothing allocated: not an error, tree unchanged */
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return 0;
+	}
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Copy the root into a real block. */
+	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+	if (error)
+		goto error0;
+
+	/*
+	 * Copy xfs_btree_block_len() bytes of block header from the root.
+	 * we can't just memcpy() the root in for CRC enabled btree blocks.
+	 * In that case have to also ensure the blkno remains correct, so
+	 * bb_blkno is overwritten with the address of the new buffer.
+	 */
+	memcpy(cblock, block, xfs_btree_block_len(cur));
+	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+			cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
+		else
+			cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
+	}
+
+	be16_add_cpu(&block->bb_level, 1);	/* root moves up one level ... */
+	xfs_btree_set_numrecs(block, 1);	/* ... and keeps a single entry */
+	cur->bc_nlevels++;
+	cur->bc_ptrs[level + 1] = 1;	/* cursor index at the new top level */
+
+	/*
+	 * NOTE(review): cblock's numrecs is presumably the old root's count,
+	 * carried over by the header memcpy() above - confirm.
+	 */
+	kp = xfs_btree_key_addr(cur, 1, block);
+	ckp = xfs_btree_key_addr(cur, 1, cblock);
+	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+		error = xfs_btree_check_ptr(cur, pp, i, level);
+		if (error)
+			goto error0;
+	}
+#endif
+	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+	if (error)
+		goto error0;
+#endif
+	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);	/* root's first ptr = new child */
+
+	xfs_iroot_realloc(cur->bc_private.b.ip,
+			  1 - xfs_btree_get_numrecs(cblock),
+			  cur->bc_private.b.whichfork);	/* presumably resizes the
+							 * in-inode root down to
+							 * one entry - confirm */
+
+	xfs_btree_setbuf(cur, level, cbp);	/* old root level now uses cbp */
+
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+	*logflags |=
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
+	*stat = 1;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. 
*/ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. */ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. 
*/ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. 
+ */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. + */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. 
*/ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. 
make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. */ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. 
+ */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. 
+ */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. + */ +STATIC int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. 
+ */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Kill the current root node, and replace it with it's only child node. 
+ */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. + */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. 
+ */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. 
*/ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. 
If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = xfs_btree_kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. 
+ */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. 
+ */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. 
+ */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. 
+ */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. 
*/ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. 
+ * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... 
+ * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. 
+ */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return -ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != -ENOENT) + return error; + } + + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_btree.h b/kernel/fs/xfs/libxfs/xfs_btree.h new file mode 100644 index 000000000..8f18bab73 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_btree.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BTREE_H__ +#define __XFS_BTREE_H__ + +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +extern kmem_zone_t *xfs_btree_cur_zone; + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. + */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; + +/* + * This nonsense is to make -wlint happy. + */ +#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) +#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) +#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) + +#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) +#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) +#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) +#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) + +/* + * For logging record fields. 
+ */ +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_NUM_BITS 5 /* flags XFS_BB_MAGIC..XFS_BB_RIGHTSIB */ +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_NUM_BITS_CRC 9 /* flags XFS_BB_MAGIC..XFS_BB_OWNER */ +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) + +/* + * Generic stats interface + * + * XFS_BTREE_STATS_INC/ADD select a per-btree counter prefix (abtb, abtc, + * bmbt, ibt, fibt) from the cursor's bc_btnum and paste it together with the + * stat name into xs_<prefix>_2_<stat>. The cursor argument is parenthesised + * so that any pointer-valued expression can be passed safely. + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch ((cur)->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* not a real btree type; listed only to keep gcc quiet */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch ((cur)->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* not a real btree type; listed only to keep gcc quiet */ ; break; \ + } \ +} while (0) + +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ + +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; + + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct
xfs_btree_cur *src, + struct xfs_btree_cur *dst); + + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); + + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); + + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); + + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. */ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + + const struct xfs_buf_ops *buf_ops; + +#if defined(DEBUG) || defined(XFS_WARN) + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif +}; + +/* + * Reasons for the update_lastrec method to be called. + */ +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 + + +/* + * Btree cursor structure. 
+ * This collects all information needed by the btree code in one place. + */ +typedef struct xfs_btree_cur +{ + struct xfs_trans *bc_tp; /* transaction we're in, if any */ + struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ + union { + xfs_alloc_rec_incore_t a; + xfs_bmbt_irec_t b; + xfs_inobt_rec_incore_t i; + } bc_rec; /* current insert/search record value */ + struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ + int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ + __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ +#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ + __uint8_t bc_nlevels; /* number of levels in the tree */ + __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ + union { + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ + xfs_agnumber_t agno; /* ag number */ + } a; + struct { /* needed for BMAP */ + struct xfs_inode *ip; /* pointer to our inode */ + struct xfs_bmap_free *flist; /* list to free after */ + xfs_fsblock_t firstblock; /* 1st blk allocated */ + int allocated; /* count of alloced */ + short forksize; /* fork's inode space */ + char whichfork; /* data or attr fork */ + char flags; /* flags */ +#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ + } b; + } bc_private; /* per-btree type data */ +} xfs_btree_cur_t; + +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ + + +#define XFS_BTREE_NOERROR 0 +#define XFS_BTREE_ERROR 1 + +/* + * Convert from buffer to btree block header. 
+ */ +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) + + +/* + * Check that block header is ok. + */ +int +xfs_btree_check_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp); /* buffer containing block, if any */ + +/* + * Check that (long) pointer is ok. + */ +int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_fsblock_t ptr, /* btree block disk address */ + int level); /* btree block level */ + +/* + * Delete the btree cursor. + */ +void +xfs_btree_del_cursor( + xfs_btree_cur_t *cur, /* btree cursor */ + int error); /* del because of error */ + +/* + * Duplicate the btree cursor. + * Allocate a new one, copy the record, re-get the buffers. + */ +int /* error */ +xfs_btree_dup_cursor( + xfs_btree_cur_t *cur, /* input cursor */ + xfs_btree_cur_t **ncur);/* output cursor */ + +/* + * Get a buffer for the block, return it with no data read. + * Long-form addressing. + */ +struct xfs_buf * /* buffer for fsbno */ +xfs_btree_get_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Get a buffer for the block, return it with no data read. + * Short-form addressing. + */ +struct xfs_buf * /* buffer for agno/agbno */ +xfs_btree_get_bufs( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + uint lock); /* lock flags for get_buf */ + +/* + * Check for the cursor referring to the last block at the given level. 
+ */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level); /* level to check */ + +/* + * Compute first and last byte offsets for the fields given. + * Interprets the offsets table, which contains struct field offsets. + */ +void +xfs_btree_offsets( + __int64_t fields, /* bitmask of fields */ + const short *offsets,/* table of field offsets */ + int nbits, /* number of bits to inspect */ + int *first, /* output: first byte offset */ + int *last); /* output: last byte offset */ + +/* + * Get a buffer for the block, return it read in. + * Long-form addressing. + */ +int /* error */ +xfs_btree_read_bufl( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* output: buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops); + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Long-form addressing. + */ +void /* nothing is returned, not even an error */ +xfs_btree_reada_bufl( + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); + +/* + * Read-ahead the block, don't wait for it, don't return a buffer. + * Short-form addressing.
+ */ +void /* nothing is returned, not even an error */ +xfs_btree_reada_bufs( + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); + +/* + * Initialise a new btree block header + */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + /* + * Takes the same magic/level/numrecs/owner/flags arguments as + * xfs_btree_init_block(), but is handed the block itself and an explicit + * disk address (blkno) instead of a buffer. + */ +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); + +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); + +/* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); + +/* + * Internal btree helpers also used by xfs_bmap.c. + */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); + +/* + * Helpers.
+ */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + /* bb_numrecs is kept big-endian in the block header */ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + /* convert to the big-endian form kept in the block header */ + block->bb_numrecs = cpu_to_be16(numrecs); +} + +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + /* bb_level is kept big-endian in the block header */ + return be16_to_cpu(block->bb_level); +} + + +/* + * Min and max functions for extlen, agblock, fileoff, and filblks types. + */ +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + +/* + * True when the AG number decoded from fsb is below sb_agcount and the AG + * block number decoded from it is below sb_agblocks. Both arguments are + * expanded more than once, so they must be free of side effects; they are + * parenthesised so that any expression may be passed. + */ +#define XFS_FSB_SANITY_CHECK(mp,fsb) \ + (XFS_FSB_TO_AGNO((mp), (fsb)) < (mp)->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO((mp), (fsb)) < (mp)->m_sb.sb_agblocks) + +/* + * Trace hooks. Currently not implemented as they need to be ported + * over to the generic tracing functionality, which is some effort.
+ * + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) + +#endif /* __XFS_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_cksum.h b/kernel/fs/xfs/libxfs/xfs_cksum.h new file mode 100644 index 000000000..fad1676ad --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. 
+ */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + /* store the finished checksum in place, in its __le32 form */ + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + * + * Recomputes the checksum over @length bytes of @buffer and compares it with + * the __le32 value stored at @cksum_offset. Returns non-zero when the two + * match and zero when they differ. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.c b/kernel/fs/xfs/libxfs/xfs_da_btree.c new file mode 100644 index 000000000..2385f8cd0 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_btree.c @@ -0,0 +1,2660 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" + +/* + * xfs_da_btree.c + * + * Routines to implement directories as Btrees of hashed names. + */ + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Routines used for growing the Btree. + */ +STATIC int xfs_da3_root_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_root, + xfs_da_state_blk_t *new_child); +STATIC int xfs_da3_node_split(xfs_da_state_t *state, + xfs_da_state_blk_t *existing_blk, + xfs_da_state_blk_t *split_blk, + xfs_da_state_blk_t *blk_to_add, + int treelevel, + int *result); +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *node_blk_1, + xfs_da_state_blk_t *node_blk_2); +STATIC void xfs_da3_node_add(xfs_da_state_t *state, + xfs_da_state_blk_t *old_node_blk, + xfs_da_state_blk_t *new_node_blk); + +/* + * Routines used for shrinking the Btree. 
+ */ +STATIC int xfs_da3_root_join(xfs_da_state_t *state, + xfs_da_state_blk_t *root_blk); +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk); +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, + xfs_da_state_blk_t *src_node_blk, + xfs_da_state_blk_t *dst_node_blk); + +/* + * Utility routines. + */ +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); + + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. 
+ */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + /* + * Structural checks on a da btree node buffer. + * + * Returns false when the magic number does not match the superblock's CRC + * setting (XFS_DA3_NODE_MAGIC with CRCs, XFS_DA_NODE_MAGIC without), when a + * CRC-format header carries the wrong uuid or block number, when the level is + * 0 or above XFS_DA_NODE_MAXDEPTH, or when the entry count is 0 or exceeds + * both the directory and the attribute node_ents limits. Returns true + * otherwise. + */ +static bool +xfs_da3_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + const struct xfs_dir_ops *ops; + + ops = xfs_dir_get_ops(mp, NULL); + + ops->node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; + } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + + /* + * we don't know if the node is for an attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) + return false; + + /* XXX: hash order check? */ + + return true; +} + /* + * Write verifier for da btree node buffers. A buffer that fails + * xfs_da3_node_verify() is marked -EFSCORRUPTED and reported. Otherwise, on + * CRC-enabled filesystems only, the LSN of the attached buffer log item (if + * there is one) is stamped into the header and the checksum is updated via + * xfs_buf_update_cksum(). + */ +static void +xfs_da3_node_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly.
In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_da3_node_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, -EFSBADCRC); + break; + } + /* fall through */ + case XFS_DA_NODE_MAGIC: + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + break; + } + return; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; + bp->b_ops->verify_read(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + break; + } + + /* corrupt block */ + xfs_verifier_error(bp); +} + +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, +}; + +int +xfs_da3_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; +} + +/*======================================================================== + * Routines used for growing the Btree. 
+ *========================================================================*/ + +/* + * Create the initial contents of an intermediate node. + * + * Gets a buffer for @blkno in @whichfork, attaches xfs_da3_node_buf_ops and + * the DA node buffer type, then writes a node header with the given @level. + * The in-core header starts zeroed, so no count is set here. On CRC-enabled + * filesystems the on-disk blkno, owner (the inode number of args->dp) and + * uuid are filled in as well. The header range is logged and the buffer is + * returned through @bpp. + * + * Returns 0 on success or the error from xfs_da_get_buf(). + */ +int +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; + struct xfs_inode *dp = args->dp; + + trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); + + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); + if (error) + return error; + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + node = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + /* convert the in-core header to disk format, then log just the header */ + dp->d_ops->node_hdr_to_disk(node, &ichdr); + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + *bpp = bp; + return 0; +} + +/* + * Split a leaf node, rebalance, then possibly split + * intermediate nodes, rebalance, etc. + */ +int /* error */ +xfs_da3_split( + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action = 0; + int error; + int i; + + trace_xfs_da_split(state->args); + + /* + * Walk back up the tree splitting/inserting/adjusting as necessary. + * If we need to insert and there isn't room, split the node, then + * decide which fragment to insert the new block from below into. + * Note that we may split the root this way, but we need more fixup.
+ */ + max = state->path.active - 1; + ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH)); + ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + addblk = &state->path.blk[max]; /* initial dummy value */ + for (i = max; (i >= 0) && addblk; state->path.active--, i--) { + oldblk = &state->path.blk[i]; + newblk = &state->altpath.blk[i]; + + /* + * If a leaf node then + * Allocate a new leaf node, then rebalance across them. + * else if an intermediate node then + * We split on the last layer, must we split the node? + */ + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr3_leaf_split(state, oldblk, newblk); + if ((error != 0) && (error != -ENOSPC)) { + return error; /* GROT: attr is inconsistent */ + } + if (!error) { + addblk = newblk; + break; + } + /* + * Entry wouldn't fit, split the leaf again. + */ + state->extravalid = 1; + if (state->inleaf) { + state->extraafter = 0; /* before newblk */ + trace_xfs_attr_leaf_split_before(state->args); + error = xfs_attr3_leaf_split(state, oldblk, + &state->extrablk); + } else { + state->extraafter = 1; /* after newblk */ + trace_xfs_attr_leaf_split_after(state->args); + error = xfs_attr3_leaf_split(state, newblk, + &state->extrablk); + } + if (error) + return error; /* GROT: attr inconsistent */ + addblk = newblk; + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_split(state, oldblk, newblk); + if (error) + return error; + addblk = newblk; + break; + case XFS_DA_NODE_MAGIC: + error = xfs_da3_node_split(state, oldblk, newblk, addblk, + max - i, &action); + addblk->bp = NULL; + if (error) + return error; /* GROT: dir is inconsistent */ + /* + * Record the newly split block for the next time thru? + */ + if (action) + addblk = newblk; + else + addblk = NULL; + break; + } + + /* + * Update the btree to show the new hashval for this child. 
+ */ + xfs_da3_fixhashpath(state, &state->path); + } + if (!addblk) + return 0; + + /* + * Split the root node. + */ + ASSERT(state->path.active == 0); + oldblk = &state->path.blk[0]; + error = xfs_da3_root_split(state, oldblk, addblk); + if (error) { + addblk->bp = NULL; + return error; /* GROT: dir is inconsistent */ + } + + /* + * Update pointers to the node which used to be block 0 and + * just got bumped because of the addition of a new root node. + * There might be three blocks involved if a double split occurred, + * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. + */ + node = oldblk->bp->b_addr; + if (node->hdr.info.forw) { + if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->b_addr; + node->hdr.info.back = cpu_to_be32(oldblk->blkno); + xfs_trans_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + node = oldblk->bp->b_addr; + if (node->hdr.info.back) { + if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { + bp = addblk->bp; + } else { + ASSERT(state->extravalid); + bp = state->extrablk.bp; + } + node = bp->b_addr; + node->hdr.info.forw = cpu_to_be32(oldblk->blkno); + xfs_trans_log_buf(state->args->trans, bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); + } + addblk->bp = NULL; + return 0; +} + +/* + * Split the root. We have to create a new root and point to the two + * parts (the split old root) that we just created. Copy block zero to + * the EOF, extending the inode in process. 
+ */ +STATIC int /* error */ +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; + + trace_xfs_da_root_split(state->args); + + /* + * Copy the existing (incorrect) block from the root node position + * to a free space somewhere. + */ + args = state->args; + error = xfs_da_grow_inode(args, &blkno); + if (error) + return error; + + dp = args->dp; + tp = args->trans; + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); + if (error) + return error; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr icnodehdr; + + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); + btree = dp->d_ops->node_tree_p(oldroot); + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + } else { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + leaf = (xfs_dir2_leaf_t *)oldroot; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. 
+ */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ + memcpy(node, oldroot, size); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } + xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); + blk1->bp = bp; + blk1->blkno = blkno; + + /* + * Set up the new root node. + */ + error = xfs_da3_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, + level + 1, &bp, args->whichfork); + if (error) + return error; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + +#ifdef DEBUG + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); + } +#endif + + /* Header is already logged by xfs_da_node_create */ + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); + + return 0; +} + +/* + * Split the node, rebalance, then add the new entry. 
+ */ +STATIC int /* error */ +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, /* level handed to xfs_da3_node_create() */ + int *result) /* output: 1 if oldblk was split, else 0 */ +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_split(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + + /* + * With V2 dirs the extra block is data or freespace. + * So the extra block only counts as a second new entry for the + * attribute fork. + */ + useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK; + newcount = 1 + useextra; + /* + * Do we have to split the node? + */ + if (nodehdr.count + newcount > state->args->geo->node_ents) { + /* + * Allocate a new node, add to the doubly linked chain of + * nodes, then move some of our excess entries into it. + */ + error = xfs_da_grow_inode(state->args, &blkno); + if (error) + return error; /* GROT: dir is inconsistent */ + + error = xfs_da3_node_create(state->args, blkno, treelevel, + &newblk->bp, state->args->whichfork); + if (error) + return error; /* GROT: dir is inconsistent */ + newblk->blkno = blkno; + newblk->magic = XFS_DA_NODE_MAGIC; + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); + if (error) + return error; + *result = 1; + } else { + *result = 0; + } + + /* + * Insert the new entry(s) into the correct block + * (updating last hashval in the process). + * + * xfs_da3_node_add() inserts BEFORE the given index, + * and as a result of using node_lookup_int() we always + * point to a valid entry (not after one), but a split + * operation always results in a new block whose hashvals + * FOLLOW the current block. + * + * If we had double-split op below us, then add the extra block too.
+ */ + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { + oldblk->index++; + xfs_da3_node_add(state, oldblk, addblk); + if (useextra) { + if (state->extraafter) + oldblk->index++; + xfs_da3_node_add(state, oldblk, &state->extrablk); + state->extravalid = 0; + } + } else { + newblk->index++; + xfs_da3_node_add(state, newblk, addblk); + if (useextra) { + if (state->extraafter) + newblk->index++; + xfs_da3_node_add(state, newblk, &state->extrablk); + state->extravalid = 0; + } + } + + return 0; +} + +/* + * Balance the btree elements between two intermediate nodes, + * usually one full and one empty. + * + * NOTE: if blk2 is empty, then it will get the upper half of blk1. + */ +STATIC void +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_rebalance(state->args); + + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + /* + * Figure out how many entries need to move, and in which direction. + * Swap the nodes around if that makes it simpler. 
+ */ + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { + tmpnode = node1; + node1 = node2; + node2 = tmpnode; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + swap = 1; + } + + count = (nodehdr1.count - nodehdr2.count) / 2; + if (count == 0) + return; + tp = state->args->trans; + /* + * Two cases: high-to-low and low-to-high. + */ + if (count > 0) { + /* + * Move elements in node2 up to make a hole. + */ + tmp = nodehdr2.count; + if (tmp > 0) { + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[0]; + btree_d = &btree2[count]; + memmove(btree_d, btree_s, tmp); + } + + /* + * Move the req'd B-tree elements from high in node1 to + * low in node2. + */ + nodehdr2.count += count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; + memcpy(btree_d, btree_s, tmp); + nodehdr1.count -= count; + } else { + /* + * Move the req'd B-tree elements from low in node2 to + * high in node1. + */ + count = -count; + tmp = count * (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; + memcpy(btree_d, btree_s, tmp); + nodehdr1.count += count; + + xfs_trans_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, btree_d, tmp)); + + /* + * Move elements in node2 down to fill the hole. + */ + tmp = nodehdr2.count - count; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + btree_s = &btree2[count]; + btree_d = &btree2[0]; + memmove(btree_d, btree_s, tmp); + nodehdr2.count -= count; + } + + /* + * Log header of node 1 and all current bits of node 2. 
+ */ + dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); + xfs_trans_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + + dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); + xfs_trans_log_buf(tp, blk2->bp, + XFS_DA_LOGRANGE(node2, &node2->hdr, + dp->d_ops->node_hdr_size + + (sizeof(btree2[0]) * nodehdr2.count))); + + /* + * Record the last hashval from each block for upward propagation. + * (note: don't use the swapped node pointers) + */ + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); + + /* + * Adjust the expected index for insertion. + */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ + } +} + +/* + * Add a new entry to an intermediate node. + */ +STATIC void +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_add(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); + ASSERT(newblk->blkno != 0); + if (state->args->whichfork == XFS_DATA_FORK) + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); + + /* + * We may need to make some room before we insert the new node. 
+ */ + tmp = 0; + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); + } + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + /* + * Copy the last hash value from the oldblk to propagate upwards. + */ + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); +} + +/*======================================================================== + * Routines used for shrinking the Btree. + *========================================================================*/ + +/* + * Deallocate an empty leaf node, remove it from its parent, + * possibly deallocating that block, etc... + */ +int +xfs_da3_join( + struct xfs_da_state *state) +{ + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; + + trace_xfs_da_join(state->args); + + drop_blk = &state->path.blk[ state->path.active-1 ]; + save_blk = &state->altpath.blk[ state->path.active-1 ]; + ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); + ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC || + drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + + /* + * Walk back up the tree joining/deallocating as necessary. + * When we stop dropping blocks, break out. + */ + for ( ; state->path.active >= 2; drop_blk--, save_blk--, + state->path.active--) { + /* + * See if we can combine the block with a neighbor. 
+ * (action == 0) => no options, just leave + * (action == 1) => coalesce, then unlink + * (action == 2) => block empty, unlink it + */ + switch (drop_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr3_leaf_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); + break; + case XFS_DIR2_LEAFN_MAGIC: + error = xfs_dir2_leafn_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_dir2_leafn_unbalance(state, drop_blk, save_blk); + break; + case XFS_DA_NODE_MAGIC: + /* + * Remove the offending node, fixup hashvals, + * check for a toosmall neighbor. + */ + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); + if (error) + return error; + if (action == 0) + return 0; + xfs_da3_node_unbalance(state, drop_blk, save_blk); + break; + } + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); + xfs_da_state_kill_altpath(state); + if (error) + return error; + error = xfs_da_shrink_inode(state->args, drop_blk->blkno, + drop_blk->bp); + drop_blk->bp = NULL; + if (error) + return error; + } + /* + * We joined all the way to the top. If it turns out that + * we only have one entry in the root, make the child block + * the new root. 
+ */ + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); + return error; +} + +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + +/* + * We have only one entry in the root. Copy the only remaining child of + * the old root to block 0 as the new root node. + */ +STATIC int +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) +{ + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_root_join(state->args); + + ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); + + args = state->args; + oldroot = root_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); + + /* + * If the root has more than one child, then don't do anything. + */ + if (oldroothdr.count > 1) + return 0; + + /* + * Read in the (only) child block, then copy those bytes into + * the root block's buffer and free the original child block. 
+ */ + btree = dp->d_ops->node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); + ASSERT(child != 0); + error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, + args->whichfork); + if (error) + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); + + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the b_ops pointer as well to match the buffer type change + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. + */ + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); + root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); + error = xfs_da_shrink_inode(args, child, bp); + return error; +} + +/* + * Check a node block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * If the current block is over 50% full, don't try to join it, return 0. + * If the block is empty, fill in the state structure and return 2. + * If it can be collapsed, fill in the state structure and return 1. + * If nothing can be done, return 0. + */ +STATIC int +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) +{ + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_toosmall(state->args); + + /* + * Check for the degenerate case of the block being over 50% full. 
+ * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[ state->path.active-1 ]; + info = blk->bp->b_addr; + node = (xfs_da_intnode_t *)info; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { + *action = 0; /* blk over 50%, don't try to join */ + return 0; /* blk over 50%, don't try to join */ + } + + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (nodehdr.count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (info->forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + if (error) + return error; + if (retval) { + *action = 0; + } else { + *action = 2; + } + return 0; + } + + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. 
+ */ + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; + count -= nodehdr.count; + + /* start with smaller blk num */ + forward = nodehdr.forw < nodehdr.back; + for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_da3_icnode_hdr thdr; + if (forward) + blkno = nodehdr.forw; + else + blkno = nodehdr.back; + if (blkno == 0) + continue; + error = xfs_da3_node_read(state->args->trans, dp, + blkno, -1, &bp, state->args->whichfork); + if (error) + return error; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&thdr, node); + xfs_trans_brelse(state->args->trans, bp); + + if (count - thdr.count >= 0) + break; /* fits with at least 25% to spare */ + } + if (i >= 2) { + *action = 0; + return 0; + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) { + error = xfs_da3_path_shift(state, &state->altpath, forward, + 0, &retval); + } else { + error = xfs_da3_path_shift(state, &state->path, forward, + 0, &retval); + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; + } + *action = 1; + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_inode *dp, + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = dp->d_ops->node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); +} + +/* + * Walk back up the tree adjusting hash values as necessary, + * when we stop making changes, return. 
+ */ +void +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_fixhashpath(state->args); + + level = path->active-1; + blk = &path->blk[ level ]; + switch (blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + lasthash = xfs_attr_leaf_lasthash(blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DIR2_LEAFN_MAGIC: + lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); + if (count == 0) + return; + break; + case XFS_DA_NODE_MAGIC: + lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count); + if (count == 0) + return; + break; + } + for (blk--, level--; level >= 0; blk--, level--) { + struct xfs_da3_icnode_hdr nodehdr; + + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) + break; + blk->hashval = lasthash; + btree[blk->index].hashval = cpu_to_be32(lasthash); + xfs_trans_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); + + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); + } +} + +/* + * Remove an entry from an intermediate node. + */ +STATIC void +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) +{ + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_remove(state->args); + + node = drop_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); + ASSERT(drop_blk->index >= 0); + + /* + * Copy over the offending entry, or just zero it out. 
+ */ + index = drop_blk->index; + btree = dp->d_ops->node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; + tmp *= (uint)sizeof(xfs_da_node_entry_t); + memmove(&btree[index], &btree[index + 1], tmp); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; + } + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); + + /* + * Copy the last hash value from the block to propagate upwards. + */ + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); +} + +/* + * Unbalance the elements between two intermediate nodes, + * move all Btree elements from one node into another. + */ +STATIC void +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) +{ + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_unbalance(state->args); + + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); + dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); + drop_btree = dp->d_ops->node_tree_p(drop_node); + save_btree = dp->d_ops->node_tree_p(save_node); + tp = state->args->trans; + + /* + * If the dying block has lower hashvals, then move all the + * elements in the remaining block up to make a hole. 
+ */ + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); + } else { + sindex = save_hdr.count; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); + } + + /* + * Move all the B-tree elements from drop_blk to save_blk. + */ + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; + + dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_node->hdr, + dp->d_ops->node_hdr_size)); + + /* + * Save the last hashval in the remaining block for upward propagation. + */ + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); +} + +/*======================================================================== + * Routines used for finding things in the Btree. + *========================================================================*/ + +/* + * Walk down the Btree looking for a particular filename, filling + * in the state structure as we go. + * + * We will set the state structure to point to each of the elements + * in each of the nodes where either the hashval is or should be. + * + * We support duplicate hashval's so for each entry in the current + * node that could contain the desired hashval, descend. This is a + * pruned depth-first tree search. 
+ */ +int /* error */ +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) +{ + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + + /* + * Descend thru the B-tree searching each level for the right + * node to use, until the right hashval is found. + */ + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; + for (blk = &state->path.blk[0], state->path.active = 1; + state->path.active <= XFS_DA_NODE_MAXDEPTH; + blk++, state->path.active++) { + /* + * Read the next node down in the tree. + */ + blk->blkno = blkno; + error = xfs_da3_node_read(args->trans, args->dp, blkno, + -1, &blk->bp, args->whichfork); + if (error) { + blk->blkno = 0; + state->path.active--; + return error; + } + curr = blk->bp->b_addr; + blk->magic = be16_to_cpu(curr->magic); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + + + /* + * Search an intermediate node for a match. + */ + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); + + /* + * Binary search. 
(note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); + + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); + } + } + + /* + * A leaf block that ends in the hashval that we are interested in + * (final hashval == search hashval) means that the next block may + * contain more entries with the same hashval, shift upward to the + * next leaf and keep searching. + */ + for (;;) { + if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { + retval = xfs_dir2_leafn_lookup_int(blk->bp, args, + &blk->index, state); + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); + blk->index = args->index; + args->blkno = blk->blkno; + } else { + ASSERT(0); + return -EFSCORRUPTED; + } + if (((retval == -ENOENT) || (retval == -ENOATTR)) && + (blk->hashval == args->hashval)) { + error = xfs_da3_path_shift(state, &state->path, 1, 1, + &retval); + if (error) + return error; + if (retval == 0) { + continue; + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + /* path_shift() gives ENOENT */ + retval = -ENOATTR; + } + } + break; + } + *result = retval; + return 0; +} + +/*======================================================================== + * Utility routines. 
+ *========================================================================*/ + +/* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_inode *dp, + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + dp->d_ops->node_hdr_from_disk(&node1hdr, node1); + dp->d_ops->node_hdr_from_disk(&node2hdr, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* + * Link a new block into a doubly linked list of blocks (of whatever type). + */ +int /* error */ +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) +{ + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; + struct xfs_inode *dp = state->args->dp; + + /* + * Set up environment. 
+ */ + args = state->args; + ASSERT(args != NULL); + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; + ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || + old_blk->magic == XFS_DIR2_LEAFN_MAGIC || + old_blk->magic == XFS_ATTR_LEAF_MAGIC); + + switch (old_blk->magic) { + case XFS_ATTR_LEAF_MAGIC: + before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); + break; + case XFS_DIR2_LEAFN_MAGIC: + before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp); + break; + case XFS_DA_NODE_MAGIC: + before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp); + break; + } + + /* + * Link blocks in appropriate order. + */ + if (before) { + /* + * Link new block in before existing block. + */ + trace_xfs_da_link_before(args); + new_info->forw = cpu_to_be32(old_blk->blkno); + new_info->back = old_info->back; + if (old_info->back) { + error = xfs_da3_node_read(args->trans, dp, + be32_to_cpu(old_info->back), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); + tmp_info->forw = cpu_to_be32(new_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + } + old_info->back = cpu_to_be32(new_blk->blkno); + } else { + /* + * Link new block in after existing block. 
+ */ + trace_xfs_da_link_after(args); + new_info->forw = old_info->forw; + new_info->back = cpu_to_be32(old_blk->blkno); + if (old_info->forw) { + error = xfs_da3_node_read(args->trans, dp, + be32_to_cpu(old_info->forw), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == old_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); + tmp_info->back = cpu_to_be32(new_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); + } + old_info->forw = cpu_to_be32(new_blk->blkno); + } + + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); + return 0; +} + +/* + * Unlink a block from a doubly linked list of blocks. + */ +STATIC int /* error */ +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) +{ + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; + + /* + * Set up environment. + */ + args = state->args; + ASSERT(args != NULL); + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; + ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || + save_blk->magic == XFS_DIR2_LEAFN_MAGIC || + save_blk->magic == XFS_ATTR_LEAF_MAGIC); + ASSERT(save_blk->magic == drop_blk->magic); + ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || + (be32_to_cpu(save_info->back) == drop_blk->blkno)); + ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) || + (be32_to_cpu(drop_info->back) == save_blk->blkno)); + + /* + * Unlink the leaf block from the doubly linked chain of leaves. 
+ */ + if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + trace_xfs_da_unlink_back(args); + save_info->back = drop_info->back; + if (drop_info->back) { + error = xfs_da3_node_read(args->trans, args->dp, + be32_to_cpu(drop_info->back), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); + tmp_info->forw = cpu_to_be32(save_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + } + } else { + trace_xfs_da_unlink_forward(args); + save_info->forw = drop_info->forw; + if (drop_info->forw) { + error = xfs_da3_node_read(args->trans, args->dp, + be32_to_cpu(drop_info->forw), + -1, &bp, args->whichfork); + if (error) + return error; + ASSERT(bp != NULL); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == save_info->magic); + ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); + tmp_info->back = cpu_to_be32(save_blk->blkno); + xfs_trans_log_buf(args->trans, bp, 0, + sizeof(*tmp_info) - 1); + } + } + + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + return 0; +} + +/* + * Move a path "forward" or "!forward" one block at the current level. + * + * This routine will adjust a "path" to point to the next block + * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the + * Btree, including updating pointers to the intermediate nodes between + * the new bottom and the root. 
+ */
+int							/* error */
+xfs_da3_path_shift(
+	struct xfs_da_state	*state,
+	struct xfs_da_state_path *path,	/* path to adjust in place */
+	int			forward,	/* nonzero: step to higher hashvals */
+	int			release,	/* nonzero: brelse each replaced buffer */
+	int			*result)	/* out: 0, or -ENOENT if off the tree */
+{
+	struct xfs_da_state_blk	*blk;
+	struct xfs_da_blkinfo	*info;
+	struct xfs_da_intnode	*node;
+	struct xfs_da_args	*args;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr nodehdr;
+	xfs_dablk_t		blkno = 0;	/* child block to descend into */
+	int			level;
+	int			error;
+	struct xfs_inode	*dp = state->args->dp;
+
+	trace_xfs_da_path_shift(state->args);
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block. Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 *
+	 * On a hit, the index at that level is moved by one entry and
+	 * blkno is loaded with the child pointer found there.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		node = blk->bp->b_addr;
+		dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+		btree = dp->d_ops->node_tree_p(node);
+
+		if (forward && (blk->index < nodehdr.count - 1)) {
+			blk->index++;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = -ENOENT;	/* we're out of our tree */
+		ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+		return 0;		/* not an error return; see *result */
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_trans_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 * If the read fails we return straight away; the levels
+		 * above have already had their index changed by then.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da3_node_read(args->trans, dp, blkno, -1,
+					&blk->bp, args->whichfork);
+		if (error)
+			return error;
+		info = blk->bp->b_addr;
+		ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
+		       info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
+
+
+		/*
+		 * Note: we flatten the magic number to a single type so we
+		 * don't have to compare against crc/non-crc types elsewhere.
+		 *
+		 * Node blocks also pick the next child to read: the first
+		 * entry when moving forward, the last one otherwise. Leaf
+		 * blocks are only expected at the bottom level of the path.
+		 */
+		switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+		case XFS_DA3_NODE_MAGIC:
+			blk->magic = XFS_DA_NODE_MAGIC;
+			node = (xfs_da_intnode_t *)info;
+			dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+			btree = dp->d_ops->node_tree_p(node);
+			blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = nodehdr.count - 1;
+			blkno = be32_to_cpu(btree[blk->index].before);
+			break;
+		case XFS_ATTR_LEAF_MAGIC:
+		case XFS_ATTR3_LEAF_MAGIC:
+			blk->magic = XFS_ATTR_LEAF_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+		case XFS_DIR3_LEAFN_MAGIC:
+			blk->magic = XFS_DIR2_LEAFN_MAGIC;
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+							       blk->bp, NULL);
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+	*result = 0;
+	return 0;
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */ +xfs_dahash_t +xfs_da_hashname(const __uint8_t *name, int namelen) +{ + xfs_dahash_t hash; + + /* + * Do four characters at a time as long as we can. + */ + for (hash = 0; namelen >= 4; namelen -= 4, name += 4) + hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^ + (name[3] << 0) ^ rol32(hash, 7 * 4); + + /* + * Now do the rest of the characters. + */ + switch (namelen) { + case 3: + return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^ + rol32(hash, 7 * 3); + case 2: + return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2); + case 1: + return (name[0] << 0) ^ rol32(hash, 7 * 1); + default: /* case 0: */ + return hash; + } +} + +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + +int +xfs_da_grow_inode_int( + struct xfs_da_args *args, + xfs_fileoff_t *bno, + int count) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + int w = args->whichfork; + xfs_rfsblock_t nblks = dp->i_d.di_nblocks; + struct xfs_bmbt_irec map, *mapp; + int nmap, error, got, i, mapi; + + /* + * Find a spot in the file space to put the new block. + */ + error = xfs_bmap_first_unused(tp, dp, count, bno, w); + if (error) + return error; + + /* + * Try mapping it in one filesystem block. 
+ */ + nmap = 1; + ASSERT(args->firstblock != NULL); + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (error) + return error; + + ASSERT(nmap <= 1); + if (nmap == 1) { + mapp = ↦ + mapi = 1; + } else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; + int c; + + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ + mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); + for (b = *bno, mapi = 0; b < *bno + count; ) { + nmap = MIN(XFS_BMAP_MAX_NMAP, count); + c = (int)(*bno + count - b); + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + args->firstblock, args->total, + &mapp[mapi], &nmap, args->flist); + if (error) + goto out_free_map; + if (nmap < 1) + break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } + } else { + mapi = 0; + mapp = NULL; + } + + /* + * Count the blocks we got, make sure it matches the total. + */ + for (i = 0, got = 0; i < mapi; i++) + got += mapp[i].br_blockcount; + if (got != count || mapp[0].br_startoff != *bno || + mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != + *bno + count) { + error = -ENOSPC; + goto out_free_map; + } + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + +out_free_map: + if (mapp != &map) + kmem_free(mapp); + return error; +} + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode( + struct xfs_da_args *args, + xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno; + int error; + + trace_xfs_da_grow_inode(args); + + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); + if (!error) + *new_blkno = (xfs_dablk_t)bno; + return error; +} + +/* + * Ick. 
We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file. The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ *
+ * The contents of the last block are copied into the dead buffer, then the
+ * siblings and the parent entry that pointed at the last block are repointed
+ * at the dead block number.  On success *dead_blknop and *dead_bufp are
+ * replaced with the old last block's number and buffer and 0 is returned.
+ * Inconsistent sibling or parent linkage yields -EFSCORRUPTED.
+ */
+STATIC int
+xfs_da3_swap_lastblock(
+	struct xfs_da_args	*args,
+	xfs_dablk_t		*dead_blknop,
+	struct xfs_buf		**dead_bufp)
+{
+	struct xfs_da_blkinfo	*dead_info;
+	struct xfs_da_blkinfo	*sib_info;
+	struct xfs_da_intnode	*par_node;
+	struct xfs_da_intnode	*dead_node;
+	struct xfs_dir2_leaf	*dead_leaf2;
+	struct xfs_da_node_entry *btree;
+	struct xfs_da3_icnode_hdr par_hdr;
+	struct xfs_inode	*dp;
+	struct xfs_trans	*tp;
+	struct xfs_mount	*mp;
+	struct xfs_buf		*dead_buf;
+	struct xfs_buf		*last_buf;
+	struct xfs_buf		*sib_buf;
+	struct xfs_buf		*par_buf;
+	xfs_dahash_t		dead_hash;	/* last hashval in the moved block */
+	xfs_fileoff_t		lastoff;
+	xfs_dablk_t		dead_blkno;
+	xfs_dablk_t		last_blkno;
+	xfs_dablk_t		sib_blkno;
+	xfs_dablk_t		par_blkno;
+	int			error;
+	int			w;
+	int			entno;
+	int			level;
+	int			dead_level;	/* tree level of the moved block */
+
+	trace_xfs_da_swap_lastblock(args);
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	dp = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = dp->i_mount;
+	lastoff = args->geo->freeblk;
+	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
+	if (error)
+		return error;
+	if (unlikely(lastoff == 0)) {
+		XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return -EFSCORRUPTED;
+	}
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+	error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
+	if (error)
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+	xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
+	dead_info = dead_buf->b_addr;
+	/*
+	 * Get values from the moved block.
+	 * A LEAFN block is level 0; anything else is read as a node.
+	 */
+	if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
+	    dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
+		struct xfs_dir3_icleaf_hdr leafhdr;
+		struct xfs_dir2_leaf_entry *ents;
+
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+		ents = dp->d_ops->leaf_ents_p(dead_leaf2);
+		dead_level = 0;
+		dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
+	} else {
+		struct xfs_da3_icnode_hdr deadhdr;
+
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+		btree = dp->d_ops->node_tree_p(dead_node);
+		dead_level = deadhdr.level;
+		dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
+	}
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		    be32_to_cpu(sib_info->forw) != last_blkno ||
+		    sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->forw = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		sib_buf = NULL;	/* so "done:" will not brelse the logged buffer */
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
+		error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
+		if (error)
+			goto done;
+		sib_info = sib_buf->b_addr;
+		if (unlikely(
+		       be32_to_cpu(sib_info->back) != last_blkno ||
+		       sib_info->magic != dead_info->magic)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		sib_info->back = cpu_to_be32(dead_blkno);
+		xfs_trans_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		sib_buf = NULL;	/* so "done:" will not brelse the logged buffer */
+	}
+	par_blkno = args->geo->leafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 * Each block read must be exactly one level below the previous one,
+	 * and must hold an entry whose hashval is >= dead_hash.
+	 */
+	for (;;) {
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (level >= 0 && level != par_hdr.level + 1) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		level = par_hdr.level;
+		btree = dp->d_ops->node_tree_p(par_node);
+		for (entno = 0;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].hashval) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == par_hdr.count) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		par_blkno = be32_to_cpu(btree[entno].before);
+		if (level == dead_level + 1)
+			break;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 * If it is not in this block, move on to the forward sibling, which
+	 * must be at the same level; running out of siblings is corruption.
+	 */
+	for (;;) {
+		for (;
+		     entno < par_hdr.count &&
+		     be32_to_cpu(btree[entno].before) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < par_hdr.count)
+			break;
+		par_blkno = par_hdr.forw;
+		xfs_trans_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (unlikely(par_blkno == 0)) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
+		if (error)
+			goto done;
+		par_node = par_buf->b_addr;
+		dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
+		if (par_hdr.level != level) {
+			XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
+					 XFS_ERRLEVEL_LOW, mp);
+			error = -EFSCORRUPTED;
+			goto done;
+		}
+		btree = dp->d_ops->node_tree_p(par_node);
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	btree[entno].before = cpu_to_be32(dead_blkno);
+	xfs_trans_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &btree[entno].before,
+				sizeof(btree[entno].before)));
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_trans_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_trans_brelse(tp, sib_buf);
+	xfs_trans_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ *
+ * xfs_bunmapi() is retried after each successful swap with the last block.
+ * dead_buf (possibly replaced by the swap) is invalidated in the transaction
+ * on every exit path, including errors.  Returns the last error seen, or 0.
+ */
+int
+xfs_da_shrink_inode(
+	xfs_da_args_t	*args,
+	xfs_dablk_t	dead_blkno,
+	struct xfs_buf	*dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_trans_t *tp;
+
+	trace_xfs_da_shrink_inode(args);
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	count = args->geo->fsbcount;
+	for (;;) {
+		/*
+		 * Remove extents. If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				    xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
+				    0, args->firstblock, args->flist, &done);
+		if (error == -ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				break;	/* no swap for other forks; give up */
+			error = xfs_da3_swap_lastblock(args, &dead_blkno,
+						      &dead_buf);
+			if (error)
+				break;
+		} else {
+			break;
+		}
+	}
+	xfs_trans_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ * Returns nonzero if they are, 0 otherwise.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t	*mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;	/* file offset the next mapping must start at */
+
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
+
+/*
+ * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
+ *
+ * For the single map case, it is assumed that the caller has provided a pointer
+ * to a valid xfs_buf_map. For the multiple map case, this function will
+ * allocate the xfs_buf_map to hold all the maps and replace the caller's single
+ * map pointer with the allocated map.
+ */ +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + int *nmaps, + struct xfs_bmbt_irec *irecs, + int nirecs) +{ + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); + if (!map) + return -ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); + + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; + + /* + * Caller doesn't have a mapping. -2 means don't complain + * if we land in a hole. + */ + if (mappedbno == -1 || mappedbno == -2) { + /* + * Optimize the one-block case. 
+ */ + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); + + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); + if (error) + goto out; + } else { + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; + } + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : -EFSCORRUPTED; + if (unlikely(error == -EFSCORRUPTED)) { + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + int i; + xfs_alert(mp, "%s: bno %lld dir: inode %lld", + __func__, (long long)bno, + (long long)dp->i_ino); + for (i = 0; i < *nmaps; i++) { + xfs_alert(mp, +"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", + i, + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); + } + } + XFS_ERROR_REPORT("xfs_da_do_buf(1)", + XFS_ERRLEVEL_LOW, mp); + } + goto out; + } + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); + return error; +} + +/* + * Get a buffer for the dir/attr block. + */ +int +xfs_da_get_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? 
bp->b_error : -EIO; + if (error) { + if (bp) + xfs_trans_brelse(trans, bp); + goto out_free; + } + + *bpp = bp; + +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Get a buffer for the dir/attr block, fill in the contents. + */ +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp, ops); + if (error) + goto out_free; + + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + *bpp = bp; +out_free: + if (mapp != &map) + kmem_free(mapp); + + return error; +} + +/* + * Readahead the dir/attr block. 
+ */ +xfs_daddr_t +xfs_da_reada_buf( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; + } + + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); + +out_free: + if (mapp != &map) + kmem_free(mapp); + + if (error) + return -1; + return mappedbno; +} diff --git a/kernel/fs/xfs/libxfs/xfs_da_btree.h b/kernel/fs/xfs/libxfs/xfs_da_btree.h new file mode 100644 index 000000000..6e153e399 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_btree.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_BTREE_H__ +#define __XFS_DA_BTREE_H__ + +struct xfs_bmap_free; +struct xfs_inode; +struct xfs_trans; +struct zone; +struct xfs_dir_ops; + +/* + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. 
Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+	int		blksize;	/* da block size in bytes */
+	int		fsbcount;	/* da block size in filesystem blocks */
+	uint8_t		fsblog;		/* log2 of _filesystem_ block size */
+	uint8_t		blklog;		/* log2 of da block size */
+	uint		node_ents;	/* # of entries in a danode */
+	int		magicpct;	/* 37% of block size in bytes */
+	xfs_dablk_t	datablk;	/* blockno of dir data v2 */
+	xfs_dablk_t	leafblk;	/* blockno of leaf data v2 */
+	xfs_dablk_t	freeblk;	/* blockno of free data v2 */
+};
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Search comparison results, as returned by the compname method of
+ * struct xfs_nameops.
+ */
+enum xfs_dacmp {
+	XFS_CMP_DIFFERENT,	/* names are completely different */
+	XFS_CMP_EXACT,		/* names are exactly the same */
+	XFS_CMP_CASE		/* names are same but differ in case */
+};
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+	struct xfs_da_geometry *geo;	/* da block geometry */
+	const __uint8_t	*name;		/* string (maybe not NUL terminated) */
+	int		namelen;	/* length of string (maybe no NUL) */
+	__uint8_t	filetype;	/* filetype of inode for directories */
+	__uint8_t	*value;		/* set of bytes (maybe contain NULs) */
+	int		valuelen;	/* length of value */
+	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
+	xfs_dahash_t	hashval;	/* hash value of name */
+	xfs_ino_t	inumber;	/* input/output inode number */
+	struct xfs_inode *dp;		/* directory inode to manipulate */
+	xfs_fsblock_t	*firstblock;	/* ptr to firstblock for bmap calls */
+	struct xfs_bmap_free *flist;	/* ptr to freelist for bmap_finish */
+	struct xfs_trans *trans;	/* current trans (changes over time) */
+	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
+	int		whichfork;	/* data or attribute fork */
+	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
+	int		index;		/* index of attr of interest in blk */
+	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
+	int		rmtblkcnt;	/* remote attr value block count */
+	int		rmtvaluelen;	/* remote attr value length in bytes */
+	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
+	int		index2;		/* index of 2nd attr in blk */
+	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
+	int		rmtblkcnt2;	/* remote attr value block count */
+	int		rmtvaluelen2;	/* remote attr value length in bytes */
+	int		op_flags;	/* operation flags */
+	enum xfs_dacmp	cmpresult;	/* name compare result for lookups */
+} xfs_da_args_t;
+
+/*
+ * Operation flags (bits of xfs_da_args.op_flags).  XFS_DA_OP_FLAGS pairs each
+ * bit with its printable name.
+ */
+#define XFS_DA_OP_JUSTCHECK	0x0001	/* check for ok with no space */
+#define XFS_DA_OP_RENAME	0x0002	/* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME	0x0004	/* this is an add operation */
+#define XFS_DA_OP_OKNOENT	0x0008	/* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP	0x0010	/* lookup to return CI name if found */
+
+#define XFS_DA_OP_FLAGS \
+	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
+	{ XFS_DA_OP_RENAME,	"RENAME" }, \
+	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
+	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
+	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+	struct xfs_buf	*bp;		/* buffer containing block */
+	xfs_dablk_t	blkno;		/* filesystem blkno of buffer */
+	xfs_daddr_t	disk_blkno;	/* on-disk blkno (in BBs) of buffer */
+	int		index;		/* relevant index into block */
+	xfs_dahash_t	hashval;	/* last hash value in block */
+	int		magic;		/* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+	int			active;		/* number of active levels */
+	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+	xfs_da_args_t		*args;		/* filename arguments */
+	struct xfs_mount	*mp;		/* filesystem mount point */
+	xfs_da_state_path_t	path;		/* search/split paths */
+	xfs_da_state_path_t	altpath;	/* alternate path for join */
+	unsigned char		inleaf;		/* insert into 1->lf, 0->splf */
+	unsigned char		extravalid;	/* T/F: extrablk is in use */
+	unsigned char		extraafter;	/* T/F: extrablk is after new */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leaves */
+						/* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))	/* byte offset of ADDR from BASE */
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+	xfs_dahash_t	(*hashname)(struct xfs_name *);
+	enum xfs_dacmp	(*compname)(struct xfs_da_args *,
+				    const unsigned char *, int);
+};
+
+
+/*========================================================================
+ * Function prototypes.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
+			      int level, struct xfs_buf **bpp, int whichfork);
+int	xfs_da3_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_da3_join(xfs_da_state_t *state);
+void	xfs_da3_fixhashpath(struct xfs_da_state *state,
+			    struct xfs_da_state_path *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int	xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+					 int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int	xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+				       xfs_da_state_blk_t *new_blk);
+int	xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			 xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			 struct xfs_buf **bpp, int which_fork);
+
+/*
+ * Inode space, buffer and name hashing/comparison routines.
+ */
+int	xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int	xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
+			      int count);
+int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			      xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			      struct xfs_buf **bp, int whichfork);
+int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			       struct xfs_buf **bpp, int whichfork,
+			       const struct xfs_buf_ops *ops);
+xfs_daddr_t	xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+				xfs_daddr_t mapped_bno, int whichfork,
+				const struct xfs_buf_ops *ops);	/* -1 if the block cannot be mapped */
+int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+					  struct xfs_buf *dead_buf);
+
+uint xfs_da_hashname(const __uint8_t *name_string, int name_length);	/* the definition returns xfs_dahash_t */
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+				const unsigned char *name, int len);
+
+
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+
+extern struct kmem_zone *xfs_da_state_zone;
+extern const struct xfs_nameops xfs_default_nameops;
+
+#endif	/* __XFS_DA_BTREE_H__ */
diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.c b/kernel/fs/xfs/libxfs/xfs_da_format.c
new file mode 100644
index 000000000..9d624a622
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_da_format.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" + +/* + * Shortform directory ops + */ +static int +xfs_dir2_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + return count; +} + +static int +xfs_dir3_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); +} + +static struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + +static struct xfs_dir2_sf_entry * +xfs_dir3_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * For filetype enabled shortform directories, the file type field is stored at + * the end of the name. Because it's only a single byte, endian conversion is + * not necessary. For non-filetype enable directories, the type is always + * unknown and we never store the value. 
+ */ +static __uint8_t +xfs_dir2_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t ftype; + + ftype = sfep->name[sfep->namelen]; + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + sfep->name[sfep->namelen] = ftype; +} + +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. + */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +static xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. 
Hence the inode numbers may only + * be accessed through the helpers below. + */ +static xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); +} + +static xfs_ino_t +xfs_dir3_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); +} + +static void +xfs_dir3_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); +} + + +/* + * Directory data block operations + */ + +/* + * For special situations, the dirent size ends up fixed because we always know + * what the size of the entry is. That's true for the "." and "..", and + * therefore we know that they are a fixed size and hence their offsets are + * constant, as is the first entry. + * + * Hence, this calculation is written as a macro to be able to be calculated at + * compile time and so certain offsets can be calculated directly in the + * structure initialiser via the macro. There are two macros - one for dirents + * with ftype and one without so there are no unresolvable conditionals in the + * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power + * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */ +#define XFS_DIR2_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) + +#define XFS_DIR3_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + XFS_DIR2_DATA_ALIGN) + +static int +xfs_dir2_data_entsize( + int n) +{ + return XFS_DIR2_DATA_ENTSIZE(n); +} + +static int +xfs_dir3_data_entsize( + int n) +{ + return XFS_DIR3_DATA_ENTSIZE(n); +} + +static __uint8_t +xfs_dir2_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + __uint8_t ftype = dep->name[dep->namelen]; + + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + dep->name[dep->namelen] = type; +} + +/* + * Pointer to an entry's tag word. + */ +static __be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +static __be16 * +xfs_dir3_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * location of . and ..
in data space (always block 0) + */ +static struct xfs_dir2_data_entry * +xfs_dir2_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr 
*hdr) +{ + return hdr->bestfree; +} + +static struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + + +/* + * Directory Leaf block operations + */ +static int +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return lp->__ents; +} + +static int +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return ((struct xfs_dir3_leaf *)lp)->__ents; +} + +static void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + + ASSERT(to->magic == 
XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); +} + +static void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +static void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); +} + + +/* + * Directory/Attribute Node block operations + */ +static struct xfs_da_node_entry * +xfs_da2_node_tree_p(struct xfs_da_intnode *dap) +{ + return dap->__btree; +} + +static struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + return ((struct xfs_da3_intnode *)dap)->__btree; +} + +static void +xfs_da2_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + 
to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +static void +xfs_da2_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); +} + +static void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); +} + + +/* + * Directory free space block operations + */ +static int +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir2_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. 
+ */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(geo); +} + +static int +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir3_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. 
+ */ +static int +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir3_free_max_bests(geo); +} + +static void +xfs_dir2_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); +} + +static void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); +} + +static const struct xfs_dir_ops xfs_dir2_ops = { + .sf_entsize = xfs_dir2_sf_entsize, + .sf_nextentry = xfs_dir2_sf_nextentry, + .sf_get_ftype = xfs_dir2_sfe_get_ftype, + .sf_put_ftype = xfs_dir2_sfe_put_ftype, + .sf_get_ino = xfs_dir2_sfe_get_ino, + .sf_put_ino = xfs_dir2_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = 
xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir2_data_entsize, + .data_get_ftype = xfs_dir2_data_get_ftype, + .data_put_ftype = xfs_dir2_data_put_ftype, + .data_entry_tag_p = xfs_dir2_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_ftype_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = 
xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir3_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = 
xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir3_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), + + .data_dot_entry_p = xfs_dir3_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir3_data_first_entry_p, + .data_entry_p = xfs_dir3_data_entry_p, + .data_unused_p = xfs_dir3_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir3_max_leaf_ents, + .leaf_ents_p = xfs_dir3_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), + .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, + .free_max_bests = xfs_dir3_free_max_bests, + .free_bests_p = xfs_dir3_free_bests_p, + .db_to_fdb = xfs_dir3_db_to_fdb, + .db_to_fdindex = xfs_dir3_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, +}; + +static const struct xfs_dir_ops xfs_dir3_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, +}; + +/* + * 
Return the ops structure according to the current config. If we are passed + * an inode, then that overrides the default config we use which is based on + * feature bits. + */ +const struct xfs_dir_ops * +xfs_dir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_dir_inode_ops) + return mp->m_dir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_ops; + if (xfs_sb_version_hasftype(&mp->m_sb)) + return &xfs_dir2_ftype_ops; + return &xfs_dir2_ops; +} + +const struct xfs_dir_ops * +xfs_nondir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_nondir_inode_ops) + return mp->m_nondir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_nondir_ops; + return &xfs_dir2_nondir_ops; +} diff --git a/kernel/fs/xfs/libxfs/xfs_da_format.h b/kernel/fs/xfs/libxfs/xfs_da_format.h new file mode 100644 index 000000000..74bcbabfa --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_da_format.h @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_FORMAT_H__ +#define __XFS_DA_FORMAT_H__ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. 
+ * + * It is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* following block in list */ + __be32 back; /* previous block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed.
+ * + * Since we have duplicate keys, use a binary search but always follow + * all matches in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +} xfs_da_intnode_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +/* + * Directory version 2. + * + * There are 4 possible formats: + * - shortform - embedded into the inode + * - single block - data with embedded leaf at the end + * - multiple data blocks, single leaf+freeindex block + * - data blocks, node and leaf blocks (btree), freeindex blocks + * + * Note: many node blocks structures and constants are shared with the attr + * code and defined in xfs_da_btree.h.
+ */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ + +/* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or if the format is v3 + * they implement the v3 functionality. This means the existing dir2 code is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as it doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* + * Dirents in version 3 directories have a file type field. Additions to this + * list are an on-disk format change, requiring feature bits.
Valid values + * are as follows: + */ +#define XFS_DIR3_FT_UNKNOWN 0 +#define XFS_DIR3_FT_REG_FILE 1 +#define XFS_DIR3_FT_DIR 2 +#define XFS_DIR3_FT_CHRDEV 3 +#define XFS_DIR3_FT_BLKDEV 4 +#define XFS_DIR3_FT_FIFO 5 +#define XFS_DIR3_FT_SOCK 6 +#define XFS_DIR3_FT_SYMLINK 7 +#define XFS_DIR3_FT_WHT 8 + +#define XFS_DIR3_FT_MAX 9 + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to fit into the + * literal area of the inode. These "shortform" directories consist of a + * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry + * structures.
Due to the different inode number storage size and the variable + * length name field in the xfs_dir2_sf_entry all these structures are + * variable length, and the accessors in this file should be used to iterate + * over them. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __u8 namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 name[]; /* name, variable size */ + /* + * A single byte containing the file type field follows the name + * (ahead of the inode number) for filetype enabled directory entries. + * + * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a + * variable offset after the name. + */ +} __arch_pack xfs_dir2_sf_entry_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return sizeof(struct xfs_dir2_sf_hdr) - + (i8count == 0) * + (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return get_unaligned_be16(&sfep->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + put_unaligned_be16(off, &sfep->offset.i); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); +} + +/* + * Data block structures. + * + * A pure data block looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | ...
| + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + * + * In addition to the pure data blocks for the data and node formats, + * most structures are also used for the combined data/freespace "block" + * format below. + */ + +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Describe a free area in the data block. + * + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ + /* XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. 
+ */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. After the variable length name field there is a + * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p. + * + * For dir3 structures, there is file type field between the name and the tag. + * This can only be manipulated by helper functions. It is packed hard against + * the end of the name so any padding for rounding is between the file type and + * the tag. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[]; /* name bytes, no null */ + /* __u8 filetype; */ /* type of inode we point to */ + /* __be16 tag; */ /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. + * + * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed + * using xfs_dir2_data_unused_tag_p. + */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Leaf block structures. 
/*
 * NOTE(review): presumably the in-core counterpart of the on-disk leaf
 * headers (xfs_dir2_leaf_hdr / xfs_dir3_leaf_hdr).  All fields use native
 * integer types rather than __be16/__be32.  Confirm against the helpers
 * that convert the header to and from disk.
 */
struct xfs_dir3_icleaf_hdr {
	__uint32_t		forw;	/* presumably taken from the da blkinfo - TODO confirm */
	__uint32_t		back;	/* presumably taken from the da blkinfo - TODO confirm */
	__uint16_t		magic;
	__uint16_t		count;	/* cf. on-disk "count of entries" */
	__uint16_t		stale;	/* cf. on-disk "count of stale entries" */
};
+ */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ +} xfs_dir2_leaf_t; + +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; + +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Free space block defintions for the node format. + */ + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. 
+ */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +/* + * Single block format. + * + * The single block format looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * | ... | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * +-------------------------------------------------+ + * | xfs_dir2_block_tail_t | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + + +/* + * Attribute storage layout + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. 
/*
 * One freemap slot: a run of free bytes in an attribute leaf block,
 * recorded as a (base, size) pair.  The leaf header keeps
 * XFS_ATTR_LEAF_MAPSIZE of these.
 */
typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
	__be16	base;			  /* base of free region */
	__be16	size;			  /* length of free region */
} xfs_attr_leaf_map_t;
/*
 * "Local" name/value entry: the name bytes and the value bytes are both
 * held in nameval.
 */
typedef struct xfs_attr_leaf_name_local {
	__be16	valuelen;		/* number of bytes in value */
	__u8	namelen;		/* length of name bytes */
	/*
	 * Declared with one element but used as a variable-length area.
	 * xfs_attr_leaf_entsize_local() sizes an entry as
	 * sizeof(this struct) - 1 + namelen + valuelen, rounded up to
	 * XFS_ATTR_LEAF_NAME_ALIGN, so the "[1]" must not be changed.
	 */
	__u8	nameval[1];		/* name/value bytes */
} xfs_attr_leaf_name_local_t;
/*
 * Leaf header for the CRC enabled ("version 3") attribute leaf format.
 * The fields after info have the same names and types as those in
 * struct xfs_attr_leaf_hdr; the per-field notes below are copied from
 * there.
 */
struct xfs_attr3_leaf_hdr {
	struct xfs_da3_blkinfo	info;		/* holds the crc, see XFS_ATTR3_LEAF_CRC_OFF */
	__be16			count;		/* count of active leaf_entry's */
	__be16			usedbytes;	/* num bytes of names/values stored */
	__be16			firstused;	/* first used byte in name area */
	__u8			holes;		/* != 0 if blk needs compaction */
	__u8			pad1;
	struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
						/* N largest free regions */
	__be32			pad2;		/* 64 bit alignment */
};
+ */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + +/* + * Cast typed pointers for "local" and "remote" name/value structs. 
/*
 * Returns (bsize >> 1) + (bsize >> 2), i.e. one half plus one quarter of
 * the given block size, each term truncated by the shift.
 */
static inline int xfs_attr_leaf_entsize_local_max(int bsize)
{
	int	half = bsize >> 1;
	int	quarter = bsize >> 2;

	return half + quarter;
}
/*
 * Evaluates to bufsize minus sizeof(struct xfs_attr3_rmt_hdr) when
 * xfs_sb_version_hascrc(&mp->m_sb) is true, and to bufsize unchanged
 * otherwise.
 *
 * NOTE(review): presumably the bytes left for attribute value data once
 * the per-block header is accounted for - confirm against the callers.
 */
#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize)	\
	((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
			sizeof(struct xfs_attr3_rmt_hdr) : 0))
+ * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + +int +xfs_da_mount( + struct xfs_mount *mp) +{ + struct xfs_da_geometry *dageo; + int nodehdr_size; + + + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= + XFS_MAX_BLOCKSIZE); + + mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); + mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); + + nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; + mp->m_dir_geo = 
kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return -ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; + + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); +} + +/* + * Return 1 if directory contains only "." and "..". + */ +int +xfs_dir_isempty( + xfs_inode_t *dp) +{ + xfs_dir2_sf_hdr_t *sfp; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + if (dp->i_d.di_size == 0) /* might happen during shutdown. 
*/ + return 1; + if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) + return 0; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + return !sfp->count; +} + +/* + * Validate a given inode number. + */ +int +xfs_dir_ino_validate( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + xfs_agblock_t agblkno; + xfs_agino_t agino; + xfs_agnumber_t agno; + int ino_ok; + int ioff; + + agno = XFS_INO_TO_AGNO(mp, ino); + agblkno = XFS_INO_TO_AGBNO(mp, ino); + ioff = XFS_INO_TO_OFFSET(mp, ino); + agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff); + ino_ok = + agno < mp->m_sb.sb_agcount && + agblkno < mp->m_sb.sb_agblocks && + agblkno != 0 && + ioff < (1 << mp->m_sb.sb_inopblog) && + XFS_AGINO_TO_INO(mp, agno, agino) == ino; + if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, + XFS_RANDOM_DIR_INO_VALIDATE))) { + xfs_warn(mp, "Invalid inode number 0x%Lx", + (unsigned long long) ino); + XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * Initialize a directory with its "." and ".." entries. + */ +int +xfs_dir_init( + xfs_trans_t *tp, + xfs_inode_t *dp, + xfs_inode_t *pdp) +{ + struct xfs_da_args *args; + int error; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) + return error; + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; +} + +/* + * Enter a name in a directory, or check for available space. + * If inum is 0, only the available space test is performed. 
+ */ +int +xfs_dir_createname( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t inum, /* new entry inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + if (inum) { + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + XFS_STATS_INC(xs_dir_create); + } + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + if (!inum) + args->op_flags |= XFS_DA_OP_JUSTCHECK; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); + else + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); + return rval; +} + +/* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. 
+ */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return -ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return -EEXIST; + + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + if (!args->value) + return -ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return -EEXIST; +} + +/* + * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. + */ + +int +xfs_dir_lookup( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_lookup); + + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. 
+ */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args->op_flags |= XFS_DA_OP_CILOOKUP; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); + else + rval = xfs_dir2_node_lookup(args); + +out_check_rval: + if (rval == -EEXIST) + rval = 0; + if (!rval) { + *inum = args->inumber; + if (ci_name) { + ci_name->name = args->value; + ci_name->len = args->valuelen; + } + } +out_free: + kmem_free(args); + return rval; +} + +/* + * Remove an entry from a directory. 
+ */ +int +xfs_dir_removename( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, + xfs_ino_t ino, + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_remove); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); + else + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); + return rval; +} + +/* + * Replace the inode number of a directory entry. 
+ */ +int +xfs_dir_replace( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name, /* name of entry to replace */ + xfs_ino_t inum, /* new inode number */ + xfs_fsblock_t *first, /* bmap's firstblock */ + xfs_bmap_free_t *flist, /* bmap's freeblock list */ + xfs_extlen_t total) /* bmap's total block count */ +{ + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return -ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); + else + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); + return rval; +} + +/* + * See if this entry can be added to the directory without allocating space. + */ +int +xfs_dir_canenter( + xfs_trans_t *tp, + xfs_inode_t *dp, + struct xfs_name *name) /* name of entry to add */ +{ + return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0); +} + +/* + * Utility routines. + */ + +/* + * Add a block to the directory. + * + * This routine is for data and free blocks, not leaf/node blocks which are + * handled by xfs_da_grow_inode. 
+ */ +int +xfs_dir2_grow_inode( + struct xfs_da_args *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + int error; + + trace_xfs_dir2_grow_inode(args, space); + + /* + * Set lowest possible block in the space requested. + */ + bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); + count = args->geo->fsbcount; + + error = xfs_da_grow_inode_int(args, &bno, count); + if (error) + return error; + + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); + + /* + * Update file's size if this is the data space and it grew. + */ + if (space == XFS_DIR2_DATA_SPACE) { + xfs_fsize_t size; /* directory file (data) size */ + + size = XFS_FSB_TO_B(mp, bno + count); + if (size > dp->i_d.di_size) { + dp->i_d.di_size = size; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + } + } + return 0; +} + +/* + * See if the directory is a single-block form directory. + */ +int +xfs_dir2_isblock( + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + int rval; + + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) + return rval; + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); + *vp = rval; + return 0; +} + +/* + * See if the directory is a single-leaf form directory. + */ +int +xfs_dir2_isleaf( + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ +{ + xfs_fileoff_t last; /* last file offset */ + int rval; + + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) + return rval; + *vp = last == args->geo->leafblk + args->geo->fsbcount; + return 0; +} + +/* + * Remove the given block from the directory. 
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,		/* directory op arguments (geo, dp, trans, ...) */
+	xfs_dir2_db_t	db,		/* directory (DB) block number to remove */
+	struct xfs_buf	*bp)		/* buffer of that block, invalidated on success */
+{
+	xfs_fileoff_t	bno;		/* in: da, out: set by xfs_bmap_last_before */
+	xfs_dablk_t	da;		/* db converted to a dablk number */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
+	/*
+	 * Returns 0 on success, or the error from xfs_bunmapi() or
+	 * xfs_bmap_last_before().  If the unmap fails, bp is neither
+	 * invalidated nor released here.  When the removed block was the
+	 * last data block, di_size is reset and the inode core is logged.
+	 */
+	trace_xfs_dir2_shrink_inode(args, db);
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = xfs_dir2_db_to_da(args->geo, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree.  This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_trans_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 * (Blocks at or beyond XFS_DIR2_LEAF_OFFSET do not affect di_size.)
+	 */
+	if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
+		return 0;
+	bno = da;
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == args->geo->datablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2.h b/kernel/fs/xfs/libxfs/xfs_dir2.h
new file mode 100644
index 000000000..e55353651
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dir2_sf_hdr;
+struct xfs_dir2_sf_entry;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_data_entry;
+struct xfs_dir2_data_unused;
+
+extern struct xfs_name	xfs_name_dotdot;
+
+/*
+ * directory filetype conversion tables.
+ */
+#define S_SHIFT 12	/* NOTE(review): presumably the i_mode shift used to index xfs_mode_to_ftype[] - confirm at the users */
+extern const unsigned char xfs_mode_to_ftype[];
+
+/*
+ * directory operations vector for encode/decode routines
+ *
+ * A table of function pointers and constants.  xfs_dir_get_ops() and
+ * xfs_nondir_get_ops(), declared after the structure, each return a
+ * pointer to one of these tables.
+ */
+struct xfs_dir_ops {	/* sf_*: work on struct xfs_dir2_sf_hdr / xfs_dir2_sf_entry */
+	int	(*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+	struct xfs_dir2_sf_entry *
+		(*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+	__uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+				__uint8_t ftype);
+	xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+				struct xfs_dir2_sf_entry *sfep);
+	void	(*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+			      struct xfs_dir2_sf_entry *sfep,
+			      xfs_ino_t ino);
+	xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+	void	(*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+				     xfs_ino_t ino);
+	/* data_*: entry size, file type and tag accessors for data entries */
+	int	(*data_entsize)(int len);
+	__uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+	void	(*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+				__uint8_t ftype);
+	__be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+	struct xfs_dir2_data_free *
+		(*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+	/* NOTE(review): presumably byte offsets of ".", ".." and the first entry in a data block - confirm */
+	xfs_dir2_data_aoff_t data_dot_offset;
+	xfs_dir2_data_aoff_t data_dotdot_offset;
+	xfs_dir2_data_aoff_t data_first_offset;
+	size_t	data_entry_offset;
+	/* pointer-returning counterparts, all derived from the data block header */
+	struct xfs_dir2_data_entry *
+		(*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_entry *
+		(*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+	struct xfs_dir2_data_unused *
+		(*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+	/* leaf_*: header size and xfs_dir2_leaf <-> xfs_dir3_icleaf_hdr conversion */
+	int	leaf_hdr_size;
+	void	(*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+				    struct xfs_dir3_icleaf_hdr *from);
+	void	(*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+				      struct xfs_dir2_leaf *from);
+	int	(*leaf_max_ents)(struct xfs_da_geometry *geo);
+	struct xfs_dir2_leaf_entry *
+		(*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+	/* node_*: header size and xfs_da_intnode <-> xfs_da3_icnode_hdr conversion */
+	int	node_hdr_size;
+	void	(*node_hdr_to_disk)(struct xfs_da_intnode *to,
+				    struct xfs_da3_icnode_hdr *from);
+	void	(*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+				      struct xfs_da_intnode *from);
+	struct xfs_da_node_entry *
+		(*node_tree_p)(struct xfs_da_intnode *dap);
+	/* free_*: header size and xfs_dir2_free <-> xfs_dir3_icfree_hdr conversion */
+	int	free_hdr_size;
+	void	(*free_hdr_to_disk)(struct xfs_dir2_free *to,
+				    struct xfs_dir3_icfree_hdr *from);
+	void	(*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+				      struct xfs_dir2_free *from);
+	int	(*free_max_bests)(struct xfs_da_geometry *geo);
+	__be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+	xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+				   xfs_dir2_db_t db);
+	int	(*db_to_fdindex)(struct xfs_da_geometry *geo,
+				 xfs_dir2_db_t db);
+};
+
+extern const struct xfs_dir_ops *
+	xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+	xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+
+/*
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t *inum,
+				struct xfs_name *ci_name);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t ino,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_name *name);
+
+/*
+ * Direct call from the bmap code, bypassing the generic directory layer.
+ */
+extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+/*
+ * Interface routines used by userspace utilities
+ *
+ * xfs_dir2_isblock() and xfs_dir2_isleaf() return an error code and report
+ * their answer (1 or 0) through *r.
+ */
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+				struct xfs_buf *bp);
+
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
+		struct xfs_dir2_data_hdr *hdr, int *loghead);
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
+		struct xfs_buf *bp);
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+		struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+		struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
+		xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
+		int *needlogp, int *needscanp);
+
+extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
+		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+		struct xfs_dir2_data_unused *dup);
+
+extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
+extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
+
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * 
Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + +#endif /* __XFS_DIR2_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_block.c b/kernel/fs/xfs/libxfs/xfs_dir2_block.c new file mode 100644 index 000000000..9354e190b --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_block.c @@ -0,0 +1,1254 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" + +/* + * Local function prototypes. + */ +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, + int *entno); +static int xfs_dir2_block_sort(const void *a, const void *b); + +static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; + +/* + * One-time startup routine called from xfs_init(). 
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1);	/* cache the hash of "." */
+	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);	/* and of ".." */
+}
+/*
+ * Check a block-format directory buffer.  With the CRC superblock feature
+ * the header must carry the DIR3 magic, the superblock's uuid and the
+ * buffer's own block number; without it only the DIR2 magic is checked.
+ * In both cases __xfs_dir3_data_check() must also return 0.
+ */
+static bool
+xfs_dir3_block_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+			return false;
+	}
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+/*
+ * Read verifier: a checksum failure (CRC filesystems only) is recorded as
+ * -EFSBADCRC, a failed xfs_dir3_block_verify() as -EFSCORRUPTED.  Any error
+ * left on the buffer is then reported through xfs_verifier_error().
+ */
+static void
+xfs_dir3_block_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_dir3_block_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+}
+/*
+ * Write verifier: refuse (with -EFSCORRUPTED) a buffer that fails
+ * xfs_dir3_block_verify().  On CRC filesystems, stamp the header with the
+ * LSN of the attached log item, if there is one, and update the checksum.
+ */
+static void
+xfs_dir3_block_write_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_buf_log_item	*bip = bp->b_fspriv;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (!xfs_dir3_block_verify(bp)) {
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return;
+
+	if (bip)
+		hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
+
+	xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
+}
+/* Buffer ops that attach the two verifiers above to block-format buffers. */
+const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+	.verify_read = xfs_dir3_block_read_verify,
+	.verify_write = xfs_dir3_block_write_verify,
+};
+/*
+ * Read the directory block at geo->datablk of dp into *bpp using the block
+ * buffer ops.  Returns the xfs_da_read_buf() result; on success inside a
+ * transaction the buffer is also tagged XFS_BLFT_DIR_BLOCK_BUF.
+ */
+int
+xfs_dir3_block_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	int			err;
+
+	err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
+				XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
+	if (!err && tp)
+		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+	return err;
+}
+/*
+ * Initialise the header of a new block-format directory buffer and attach
+ * the block buffer ops.  With CRCs the whole v3 header is zeroed and the
+ * magic, block number, owner inode and uuid are filled in; otherwise only
+ * the DIR2 magic is written.
+ */
+static void
+xfs_dir3_block_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+	bp->b_ops = &xfs_dir3_block_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+		return;
+
+	}
+	hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+}
+/*
+ * Work out where a new data entry of len bytes, and its leaf entry, can go
+ * in a block-format directory.
+ *
+ * Outputs:
+ *   *compact  1 if the leaf entries must be compacted first, else 0
+ *   *dupp     where the data entry will go: an unused region, or (when
+ *             compacting) the current start of the leaf table; NULL if
+ *             there is no room
+ *   *enddupp  the object just before the leaf table; only set when there
+ *             are no stale leaf entries, NULL otherwise
+ *   *tagpp    the tag just before the first leaf entry; NULL when stale
+ *             entries exist and the biggest free region is large enough
+ */
+static void
+xfs_dir2_block_need_space(
+	struct xfs_inode		*dp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	__be16				**tagpp,
+	struct xfs_dir2_data_unused	**dupp,
+	struct xfs_dir2_data_unused	**enddupp,
+	int				*compact,
+	int				len)
+{
+	struct xfs_dir2_data_free	*bf;
+	__be16				*tagp = NULL;
+	struct xfs_dir2_data_unused	*dup = NULL;
+	struct xfs_dir2_data_unused	*enddup = NULL;
+
+	*compact = 0;
+	bf = dp->d_ops->data_bestfree_p(hdr);
+
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 */
+	if (btp->stale) {
+		if (be16_to_cpu(bf[0].length) >= len) {
+			/*
+			 * The biggest free entry is large enough, so
+			 * compaction can be avoided.
+			 */
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)hdr + be16_to_cpu(bf[0].offset));
+			goto out;
+		}
+
+		/*
+		 * Will need to compact to make this work.
+		 * Tag just before the first leaf entry.
+		 */
+		*compact = 1;
+		tagp = (__be16 *)blp - 1;
+
+		/* Data object just before the first leaf entry.  */
+		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 * Compaction gives back (stale - 1) leaf slots, so that
+		 * is the extra space counted in both tests below.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		goto out;
+	}
+
+	/*
+	 * no stale entries, so just use free space.
+	 * Tag just before the first leaf entry.
+	 */
+	tagp = (__be16 *)blp - 1;
+
+	/* Data object just before the first leaf entry.  */
+	enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+	/*
+	 * If it's not free then can't do this add without cleaning up:
+	 * the space before the first leaf entry needs to be free so it
+	 * can be expanded to hold the pointer to the new entry.
+	 */
+	if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)hdr + be16_to_cpu(bf[0].offset));
+		if (dup != enddup) {
+			/*
+			 * Not the same free entry, just check its length.
+			 */
+			if (be16_to_cpu(dup->length) < len)
+				dup = NULL;
+			goto out;
+		}
+
+		/*
+		 * It is the biggest freespace, can it hold the leaf too?
+		 */
+		if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+			/*
+			 * No, so use the second-largest entry instead if
+			 * it works.
+			 */
+			if (be16_to_cpu(bf[1].length) >= len)
+				dup = (xfs_dir2_data_unused_t *)
+				      ((char *)hdr + be16_to_cpu(bf[1].offset));
+			else
+				dup = NULL;
+		}
+	}
+out:
+	*tagpp = tagp;
+	*dupp = dup;
+	*enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+	struct xfs_da_args		*args,
+	struct xfs_buf			*bp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,	/* leaf table before compaction */
+	int				*needlog,	/* passed on to make_free/freescan */
+	int				*lfloghigh,	/* out: last leaf index to log */
+	int				*lfloglow)	/* out: first leaf index to log */
+{
+	int			fromidx;	/* source leaf index */
+	int			toidx;		/* target leaf index */
+	int			needscan = 0;
+	int			highstale;	/* high stale index */
+	/*
+	 * Walk the leaf table from the last entry down, sliding live entries
+	 * towards the end of the table.  The first stale entry met (the
+	 * highest-numbered one) is kept, every other stale entry is dropped.
+	 * The (stale - 1) slots this frees at the front of the table are
+	 * handed back to the data free space, and the tail's count and stale
+	 * fields are updated to match.  blp itself is not updated here;
+	 * xfs_dir2_block_addname() recomputes it after the call.
+	 */
+	fromidx = toidx = be32_to_cpu(btp->count) - 1;
+	highstale = *lfloghigh = -1;
+	for (; fromidx >= 0; fromidx--) {
+		if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (highstale == -1)
+				highstale = toidx;
+			else {
+				if (*lfloghigh == -1)
+					*lfloghigh = toidx;
+				continue;
+			}
+		}
+		if (fromidx < toidx)
+			blp[toidx] = blp[fromidx];
+		toidx--;
+	}
+	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);	/* indices relative to the shrunken table */
+	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
+	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+		needlog, &needscan);
+	btp->stale = cpu_to_be32(1);	/* only the kept stale entry remains */
+	/*
+	 * If we now need to rebuild the bestfree map, do so.
+	 * This needs to happen before the next call to use_free.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(args->dp, hdr, needlog);
+}
+
+/*
+ * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	__be16			*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+	/*
+	 * Returns 0 on success.  Returns -ENOSPC if the entry does not fit
+	 * and either this is only a space check (XFS_DA_OP_JUSTCHECK) or
+	 * there is no block reservation (args->total == 0).  Otherwise a
+	 * full block is converted to leaf form and the add is redone there.
+	 * Errors from xfs_dir3_block_read(), xfs_dir2_block_to_leaf() and
+	 * xfs_dir2_leaf_addname() are passed back unchanged.
+	 */
+	trace_xfs_dir2_block_addname(args);
+
+	dp = args->dp;
+	tp = args->trans;
+
+	/* Read the (one and only) directory block into bp. */
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	len = dp->d_ops->data_entsize(args->namelen);
+
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+
+	/*
+	 * Find out if we can reuse stale entries or whether we need extra
+	 * space for entry and new leaf.
+	 */
+	xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
+				  &enddup, &compact, len);
+
+	/*
+	 * Done everything we need for a space check now.
+	 */
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
+		xfs_trans_brelse(tp, bp);
+		if (!dup)
+			return -ENOSPC;
+		return 0;
+	}
+
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/*
+		 * Don't have a space reservation: return no-space.
+		 * NOTE(review): bp is not released on this path; presumably
+		 * the caller's transaction teardown deals with it - confirm.
+		 */
+		if (args->total == 0)
+			return -ENOSPC;
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+
+	needlog = needscan = 0;
+
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 */
+	if (compact) {
+		xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
+				      &lfloghigh, &lfloglow);
+		/* recalculate blp post-compaction */
+		blp = xfs_dir2_block_leaf_p(btp);
+	} else if (btp->stale) {
+		/*
+		 * Set leaf logging boundaries to impossible state.
+		 * For the no-stale case they're set explicitly.
+		 */
+		lfloglow = be32_to_cpu(btp->count);
+		lfloghigh = -1;
+	}
+
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && be32_to_cpu(blp[mid].hashval) >= args->hashval) {
+		mid--;
+	}
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (!btp->stale) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 * This is the last sizeof(*blp) bytes of enddup.
+		 */
+		xfs_dir2_data_use_free(args, bp, enddup,
+		  (xfs_dir2_data_aoff_t)
+		  ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
+		   sizeof(*blp)),
+		  (xfs_dir2_data_aoff_t)sizeof(*blp),
+		  &needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		be32_add_cpu(&btp->count, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(dp, hdr, &needlog);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			memmove(blp, &blp[1], mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 * lowstale is the nearest stale slot at or below mid (or -1);
+	 * highstale is the nearest one above mid, searched only as far as
+	 * it could still be closer than lowstale.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			blp[lowstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < be32_to_cpu(btp->count) &&
+			blp[highstale].address !=
+			cpu_to_be32(XFS_DIR2_NULL_DATAPTR) &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == be32_to_cpu(btp->count) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				memmove(&blp[lowstale], &blp[lowstale + 1],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < be32_to_cpu(btp->count));
+			mid++;
+			if (highstale - mid)
+				memmove(&blp[mid + 1], &blp[mid],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		be32_add_cpu(&btp->stale, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	blp[mid].hashval = cpu_to_be32(args->hashval);
+	blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 * Its tag holds the entry's own offset from the block header.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dep->namelen = args->namelen;
+	memcpy(dep->name, args->name, args->namelen);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ * The logged byte range runs from the start of blp[first] to the last byte
+ * of blp[last], both measured from the block header.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp,		/* block buffer */
+	int			first,		/* index of first logged leaf */
+	int			last)		/* index of last logged leaf */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_leaf_entry_t	*blp;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
+		(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
+}
+
+/*
+ * Log the block tail.
+ * The logged byte range covers exactly the tail structure.
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure */
+	struct xfs_buf		*bp)		/* block buffer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+	xfs_dir2_block_tail_t	*btp;
+
+	btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
+	xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
+		(uint)((char *)(btp + 1) - (char *)hdr - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments */
+{
+	struct xfs_buf		*bp;		/* block buffer from the lookup */
+	xfs_inode_t		*dp;		/* incore inode */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*leaf;		/* block leaf entries */
+	xfs_dir2_data_entry_t	*dep;		/* matching data entry */
+	xfs_dir2_dataptr_t	addr;		/* dataptr stored in the leaf */
+	xfs_dir2_data_aoff_t	off;		/* entry offset in the block */
+	int			ent;		/* index of the matching leaf */
+	int			error;		/* error return value */
+
+	trace_xfs_dir2_block_lookup(args);
+
+	/*
+	 * The internal lookup reads the block and finds the leaf entry.
+	 * On failure (ENOENT included) there is no buffer to release.
+	 */
+	error = xfs_dir2_block_lookup_int(args, &bp, &ent);
+	if (error)
+		return error;
+
+	dp = args->dp;
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+
+	/* Leaf entry -> dataptr -> offset in block -> data entry. */
+	leaf = xfs_dir2_block_leaf_p(xfs_dir2_block_tail_p(args->geo, hdr));
+	addr = be32_to_cpu(leaf[ent].address);
+	off = xfs_dir2_dataptr_to_off(args->geo, addr);
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
+
+	/*
+	 * Report the inode number and file type, let the CI helper produce
+	 * the final result, then drop the buffer.
+	 */
+	args->inumber = be64_to_cpu(dep->inumber);
+	args->filetype = dp->d_ops->data_get_ftype(dep);
+	error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+	xfs_trans_brelse(args->trans, bp);
+	return error;
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	struct xfs_buf		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	enum xfs_dacmp		cmp;		/* comparison result */
+	/*
+	 * Returns 0 with *bpp and *entno set when a name matches; the buffer
+	 * is then still held and the caller releases it.  Returns -ENOENT
+	 * with the buffer already released when nothing matches, or the
+	 * error from xfs_dir3_block_read().  args->cmpresult records the
+	 * best match seen so far.
+	 */
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+
+	error = xfs_dir3_block_read(tp, dp, &bp);
+	if (error)
+		return error;
+
+	hdr = bp->b_addr;
+	xfs_dir3_data_check(dp, bp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 */
+	for (low = 0, high = be32_to_cpu(btp->count) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = be32_to_cpu(blp[mid].hashval)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+			xfs_trans_brelse(tp, bp);
+			return -ENOENT;
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && be32_to_cpu(blp[mid - 1].hashval) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 * Stale leaf entries are skipped; the continue jumps to the loop
+	 * condition, so mid still advances.
+	 */
+	do {
+		if ((addr = be32_to_cpu(blp[mid].address)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
+		/*
+		 * Compare name and if it's an exact match, return the index
+		 * and buffer. If it's the first case-insensitive match, store
+		 * the index and buffer and continue looking for an exact match.
+		 */
+		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+			args->cmpresult = cmp;
+			*bpp = bp;
+			*entno = mid;
+			if (cmp == XFS_CMP_EXACT)
+				return 0;
+		}
+	} while (++mid < be32_to_cpu(btp->count) &&
+			be32_to_cpu(blp[mid].hashval) == hash);
+
+	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+	/*
+	 * Here, we can only be doing a lookup (not a rename or replace).
+	 * If a case-insensitive match was found earlier, return success.
+	 */
+	if (args->cmpresult == XFS_CMP_CASE)
+		return 0;
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	xfs_trans_brelse(tp, bp);
+	return -ENOENT;
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error: lookup error, 0, or the result of the shortform conversion */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	trace_xfs_dir2_block_removename(args);
+
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there, the vnodeops level does a lookup first.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	/*
+	 * Mark the data entry's space free.
+	 * needlog/needscan are outputs of the helper and drive the
+	 * bestfree fixup further down.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(args, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
+		dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 * The leaf slot is kept but becomes stale, so only the stale
+	 * counter changes here.
+	 */
+	be32_add_cpu(&btp->stale, 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 */
+	blp[ent].address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, bp);
+	xfs_dir3_data_check(dp, bp);
+	/*
+	 * See if the size as a shortform is good enough.
+	 * If it would not fit in the inode's data fork we are done.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	/*
+	 * If it works, do the conversion.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ *
+ * The entry named by args is looked up, its inode number and file type
+ * are overwritten with args->inumber and args->filetype, and the
+ * modified data entry is logged.  Returns the lookup error, else 0.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+
+	trace_xfs_dir2_block_replace(args);
+
+	/*
+	 * Lookup the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	hdr = bp->b_addr;
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	/*
+	 * Point to the data entry we need to change.
+	 */
+	dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+			xfs_dir2_dataptr_to_off(args->geo,
+						be32_to_cpu(blp[ent].address)));
+	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 * The name and the leaf entry are untouched, so only the data
+	 * entry itself is logged.
+	 */
+	dep->inumber = cpu_to_be64(args->inumber);
+	dp->d_ops->data_put_ftype(dep, args->filetype);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int					/* sort order: -1, 0 or 1 by ascending hash value */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return be32_to_cpu(la->hashval) < be32_to_cpu(lb->hashval) ? -1 :
+		(be32_to_cpu(la->hashval) > be32_to_cpu(lb->hashval) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ *
+ * Returns 0 both when the conversion was done and when it turned out
+ * not to be possible (more than one data block remains, or the free
+ * space at the end of the data block cannot hold the leaf entries and
+ * tail).  A non-zero return is an error from one of the helpers.  After
+ * a successful conversion the block is further shrunk to shortform if
+ * it fits in the inode's data fork.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*lbp,		/* leaf buffer */
+	struct xfs_buf		*dbp)		/* data buffer; may be NULL, in which case it is read here */
+{
+	__be16			*bestsp;	/* leaf bests table */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	__be16			*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_leaf_entry *ents;	/* leaf entry array of the old leaf block */
+	struct xfs_dir3_icleaf_hdr leafhdr;	/* in-core copy of the leaf header */
+
+	trace_xfs_dir2_leaf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->b_addr;
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+
+	ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
+	       leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 * The loop stops with "return 0" as soon as the last data block is
+	 * not completely empty: the directory then cannot be a single block.
+	 */
+	while (dp->i_d.di_size > args->geo->blksize) {
+		int hdrsz;
+
+		hdrsz = dp->d_ops->data_entry_offset;
+		bestsp = xfs_dir2_leaf_bests_p(ltp);
+		if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
+					    args->geo->blksize - hdrsz) {
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
+				return error;
+		} else
+			return 0;
+	}
+	/*
+	 * Read the data block if we don't already have it, give up if it fails.
+	 */
+	if (!dbp) {
+		error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
+		if (error)
+			return error;
+	}
+	hdr = dbp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
+
+	/*
+	 * Size of the "leaf" area in the block.
+	 * That is the block tail plus one leaf entry per non-stale entry
+	 * of the old leaf block.
+	 */
+	size = (uint)sizeof(xfs_dir2_block_tail_t) +
+	       (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale);
+	/*
+	 * Look at the last data entry.
+	 * The last two bytes of the block are a tag holding the offset of
+	 * the entry that ends there.
+	 */
+	tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+	/*
+	 * If it's not free or is too short we can't do it.
+	 */
+	if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG ||
+	    be16_to_cpu(dup->length) < size)
+		return 0;
+
+	/*
+	 * Start converting it to block form.
+	 */
+	xfs_dir3_block_init(mp, tp, dbp, dp);
+
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 */
+	xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
+	btp->stale = 0;
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 * Entries are copied in their existing order.
+	 */
+	lep = xfs_dir2_block_leaf_p(btp);
+	for (from = to = 0; from < leafhdr.count; from++) {
+		if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			continue;
+		lep[to++] = ents[from];
+	}
+	ASSERT(to == be32_to_cpu(btp->count));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 */
+	error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
+	if (error)
+		return error;
+
+	/*
+	 * Now see if the resulting block can be shrunken to shortform.
+	 */
+	size = xfs_dir2_block_sfsize(dp, hdr, &sfh);
+	if (size > XFS_IFORK_DSIZE(dp))
+		return 0;
+
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+}
+
+/*
+ * Convert the shortform directory to block form.
+ *
+ * The inline shortform data is copied to a temporary heap buffer, the
+ * inline fork is emptied, one directory block is added and initialised,
+ * and then "." and ".." plus every shortform entry are written into it
+ * at the offsets recorded in the shortform entries.  The temporary
+ * buffer is freed on each exit path that follows its allocation.
+ * Returns -EIO for a too-short shortform directory, an error from block
+ * allocation or initialisation, else 0.
+ */
+int						/* error */
+xfs_dir2_sf_to_block(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			dummy;		/* trash */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			endoffset;	/* end of data objects */
+	int			error;		/* error return value */
+	int			i;		/* index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to scan block freespc */
+	int			newoffset;	/* offset from current entry */
+	int			offset;		/* target block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
+	xfs_dir2_sf_hdr_t	*oldsfp;	/* old shortform header */
+	xfs_dir2_sf_hdr_t	*sfp;		/* shortform header */
+	__be16			*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_name		name;		/* name handed to the hash function */
+	struct xfs_ifork	*ifp;		/* data fork of the directory inode */
+
+	trace_xfs_dir2_sf_to_block(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	/*
+	 * Bomb out if the shortform directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return -EIO;
+	}
+
+	oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
+
+	ASSERT(ifp->if_bytes == dp->i_d.di_size);
+	ASSERT(ifp->if_u1.if_data != NULL);
+	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
+	ASSERT(dp->i_d.di_nextents == 0);
+
+	/*
+	 * Copy the directory into a temporary buffer.
+	 * Then pitch the incore inode data so we can make extents.
+	 */
+	sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP);
+	memcpy(sfp, oldsfp, ifp->if_bytes);
+
+	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
+	xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+
+	/*
+	 * Add block 0 to the inode.
+	 */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	/*
+	 * Initialize the data block, then convert it to block format.
+	 */
+	error = xfs_dir3_data_init(args, blkno, &bp);
+	if (error) {
+		kmem_free(sfp);
+		return error;
+	}
+	xfs_dir3_block_init(mp, tp, bp, dp);
+	hdr = bp->b_addr;
+
+	/*
+	 * Compute size of block "tail" area.
+	 * One leaf entry per shortform entry plus two for "." and "..".
+	 */
+	i = (uint)sizeof(*btp) +
+	    (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+	/*
+	 * The whole thing is initialized to free by the init routine.
+	 * Say we're using the leaf and tail area.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	needlog = needscan = 0;
+	xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+			       i, &needlog, &needscan);
+	ASSERT(needscan == 0);
+	/*
+	 * Fill in the tail.
+	 */
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	btp->count = cpu_to_be32(sfp->count + 2);	/* ., .. */
+	btp->stale = 0;
+	blp = xfs_dir2_block_leaf_p(btp);
+	endoffset = (uint)((char *)blp - (char *)hdr);
+	/*
+	 * Remove the freespace, we'll manage it.
+	 */
+	xfs_dir2_data_use_free(args, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
+		be16_to_cpu(dup->length), &needlog, &needscan);
+	/*
+	 * Create entry for .
+	 */
+	dep = dp->d_ops->data_dot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->i_ino);
+	dep->namelen = 1;
+	dep->name[0] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
+	blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	/*
+	 * Create entry for ..
+	 */
+	dep = dp->d_ops->data_dotdot_entry_p(hdr);
+	dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
+	dep->namelen = 2;
+	dep->name[0] = dep->name[1] = '.';
+	dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+	tagp = dp->d_ops->data_entry_tag_p(dep);
+	*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+	xfs_dir2_data_log_entry(args, bp, dep);
+	blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
+	blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+				(char *)dep - (char *)hdr));
+	offset = dp->d_ops->data_first_offset;
+	/*
+	 * Loop over existing entries, stuff them in.
+	 */
+	i = 0;
+	if (!sfp->count)
+		sfep = NULL;
+	else
+		sfep = xfs_dir2_sf_firstentry(sfp);
+	/*
+	 * Need to preserve the existing offset values in the sf directory.
+	 * Insert holes (unused entries) where necessary.
+	 */
+	while (offset < endoffset) {
+		/*
+		 * sfep is null when we reach the end of the list.
+		 * In that case the rest of the data area becomes one hole.
+		 */
+		if (sfep == NULL)
+			newoffset = endoffset;
+		else
+			newoffset = xfs_dir2_sf_get_offset(sfep);
+		/*
+		 * There should be a hole here, make one.
+		 * The bestfree table is updated by hand; the loghead output
+		 * of the insert is discarded into dummy.
+		 */
+		if (offset < newoffset) {
+			dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+			dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+			dup->length = cpu_to_be16(newoffset - offset);
+			*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
+				((char *)dup - (char *)hdr));
+			xfs_dir2_data_log_unused(args, bp, dup);
+			xfs_dir2_data_freeinsert(hdr,
+						 dp->d_ops->data_bestfree_p(hdr),
+						 dup, &dummy);
+			offset += be16_to_cpu(dup->length);
+			continue;
+		}
+		/*
+		 * Copy a real entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
+		dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
+		dep->namelen = sfep->namelen;
+		dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
+		memcpy(dep->name, sfep->name, dep->namelen);
+		tagp = dp->d_ops->data_entry_tag_p(dep);
+		*tagp = cpu_to_be16((char *)dep - (char *)hdr);
+		xfs_dir2_data_log_entry(args, bp, dep);
+		name.name = sfep->name;
+		name.len = sfep->namelen;
+		blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+							hashname(&name));
+		blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
+						 (char *)dep - (char *)hdr));
+		offset = (int)((char *)(tagp + 1) - (char *)hdr);	/* first byte after this entry's tag */
+		if (++i == sfp->count)
+			sfep = NULL;
+		else
+			sfep = dp->d_ops->sf_nextentry(sfp, sfep);
+	}
+	/* Done with the temporary buffer */
+	kmem_free(sfp);
+	/*
+	 * Sort the leaf entries by hash value.
+	 * They were filled in above in shortform order, not hash order.
+	 */
+	xfs_sort(blp, be32_to_cpu(btp->count), sizeof(*blp), xfs_dir2_block_sort);
+	/*
+	 * Log the leaf entry area and tail.
+	 * Already logged the header in data_init, ignore needlog.
+	 */
+	ASSERT(needscan == 0);
+	xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir3_data_check(dp, bp);
+	return 0;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_data.c b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
new file mode 100644
index 000000000..de1ea16f5
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_data.c
@@ -0,0 +1,1049 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Check the consistency of the data block. + * The input can also be a block-format directory. + * Return 0 if the buffer is good, otherwise an error.
+ */
+int
+__xfs_dir3_data_check(
+	struct xfs_inode	*dp,		/* incore inode pointer; may be NULL when called from a verifier */
+	struct xfs_buf		*bp)		/* data block's buffer */
+{
+	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
+	xfs_dir2_data_free_t	*bf;		/* bestfree table */
+	xfs_dir2_block_tail_t	*btp=NULL;	/* block tail */
+	int			count;		/* count of entries found */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	char			*endp;		/* end of useful data */
+	int			freeseen;	/* mask of bestfrees seen */
+	xfs_dahash_t		hash;		/* hash of current name */
+	int			i;		/* leaf index */
+	int			lastfree;	/* last entry was unused */
+	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*p;		/* current data position */
+	int			stale;		/* count of stale leaves */
+	struct xfs_name		name;
+	const struct xfs_dir_ops *ops;
+	struct xfs_da_geometry	*geo;
+
+	mp = bp->b_target->bt_mount;
+	geo = mp->m_dir_geo;
+
+	/*
+	 * We can be passed a null dp here from a verifier, so we need to go the
+	 * hard way to get them.
+	 */
+	ops = xfs_dir_get_ops(mp, dp);
+
+	hdr = bp->b_addr;
+	p = (char *)ops->data_entry_p(hdr);
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		lep = xfs_dir2_block_leaf_p(btp);
+		endp = (char *)lep;
+
+		/*
+		 * The number of leaf entries is limited by the size of the
+		 * block and the amount of space used by the data entries.
+		 * We don't know how much space is used by the data entries yet,
+		 * so just ensure that the count falls somewhere inside the
+		 * block right now.
+		 */
+		XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
+			((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
+		break;
+	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+		endp = (char *)hdr + geo->blksize;
+		break;
+	default:
+		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Account for zero bestfree entries.
+	 * A zero-length slot must also have a zero offset; it counts as
+	 * "seen" so that the final freeseen == 7 test can pass.
+	 */
+	bf = ops->data_bestfree_p(hdr);
+	count = lastfree = freeseen = 0;
+	if (!bf[0].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
+		freeseen |= 1 << 0;
+	}
+	if (!bf[1].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
+		freeseen |= 1 << 1;
+	}
+	if (!bf[2].length) {
+		XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
+		freeseen |= 1 << 2;
+	}
+
+	/* The bestfree table must be sorted by descending length. */
+	XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
+						be16_to_cpu(bf[1].length));
+	XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
+						be16_to_cpu(bf[2].length));
+	/*
+	 * Loop over the data/unused entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's unused, look for the space in the bestfree table.
+		 * If we find it, account for that, else make sure it
+		 * doesn't need to be there.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			/* Two adjacent unused entries should have been merged. */
+			XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
+			/*
+			 * The length comes straight from the buffer being
+			 * checked.  Make sure the unused region ends inside
+			 * the data area before the tag lookup below, and
+			 * the "p += length" step, rely on it.
+			 */
+			XFS_WANT_CORRUPTED_RETURN(mp, endp >=
+					p + be16_to_cpu(dup->length));
+			XFS_WANT_CORRUPTED_RETURN(mp,
+				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+					       (char *)dup - (char *)hdr);
+			dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+			if (dfp) {
+				i = (int)(dfp - bf);
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					(freeseen & (1 << i)) == 0);
+				freeseen |= 1 << i;
+			} else {
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					be16_to_cpu(dup->length) <=
+						be16_to_cpu(bf[2].length));
+			}
+			p += be16_to_cpu(dup->length);
+			lastfree = 1;
+			continue;
+		}
+		/*
+		 * It's a real entry.  Validate the fields.
+		 * If this is a block directory then make sure it's
+		 * in the leaf section of the block.
+		 * The linear search is crude but this is DEBUG code.
+		 */
+		dep = (xfs_dir2_data_entry_t *)p;
+		XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+		/*
+		 * As for unused entries: the entry size is derived from the
+		 * on-disk namelen, so bound it before looking up the tag.
+		 */
+		XFS_WANT_CORRUPTED_RETURN(mp, endp >=
+				p + ops->data_entsize(dep->namelen));
+		XFS_WANT_CORRUPTED_RETURN(mp,
+			be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
+					       (char *)dep - (char *)hdr);
+		XFS_WANT_CORRUPTED_RETURN(mp,
+				ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
+		count++;
+		lastfree = 0;
+		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+		    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+			addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+						(xfs_dir2_data_aoff_t)
+						((char *)dep - (char *)hdr));
+			name.name = dep->name;
+			name.len = dep->namelen;
+			hash = mp->m_dirnameops->hashname(&name);
+			for (i = 0; i < be32_to_cpu(btp->count); i++) {
+				if (be32_to_cpu(lep[i].address) == addr &&
+				    be32_to_cpu(lep[i].hashval) == hash)
+					break;
+			}
+			XFS_WANT_CORRUPTED_RETURN(mp,
+						  i < be32_to_cpu(btp->count));
+		}
+		p += ops->data_entsize(dep->namelen);
+	}
+	/*
+	 * Need to have seen all the entries and all the bestfree slots.
+	 * For block-format buffers also check that the leaf array is sorted
+	 * by hash and that its live/stale counts agree with what was found.
+	 */
+	XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
+			if (lep[i].address ==
+			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+				stale++;
+			if (i > 0)
+				XFS_WANT_CORRUPTED_RETURN(mp,
+					be32_to_cpu(lep[i].hashval) >=
+						be32_to_cpu(lep[i - 1].hashval));
+		}
+		XFS_WANT_CORRUPTED_RETURN(mp, count ==
+			be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+		XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
+	}
+	return 0;
+}
+
+/*
+ * Structural check shared by the read and write verifiers for data-format
+ * directory blocks.  Checks the magic number (and, on CRC-enabled
+ * filesystems, the UUID and the block number recorded in the header), then
+ * runs the full consistency check above.  Returns true if the buffer passes.
+ */
+static bool
+xfs_dir3_data_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir3_blk_hdr	*hdr3 = bp->b_addr;
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+			return false;
+		if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+			return false;
+		if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+			return false;
+	} else {
+		if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
+			return false;
+	}
+	/* No inode is available in verifier context, hence the NULL dp. */
+	if (__xfs_dir3_data_check(NULL, bp))
+		return false;
+	return true;
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */ +static void +xfs_dir3_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); + return; + default: + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + break; + } +} + +static void +xfs_dir3_data_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_data_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + + +int +xfs_dir3_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + 
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; +} + +int +xfs_dir3_data_readahead( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); +} + +/* + * Given a data block and an unused entry from that block, + * return the bestfree entry if any that corresponds to it. + */ +xfs_dir2_data_free_t * +xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ +{ + xfs_dir2_data_free_t *dfp; /* bestfree entry */ + xfs_dir2_data_aoff_t off; /* offset value needed */ +#ifdef DEBUG + int matched; /* matched the value */ + int seenzero; /* saw a 0 bestfree entry */ +#endif + + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + +#ifdef DEBUG + /* + * Validate some consistency in the bestfree table. + * Check order, non-overlapping entries, and if we find the + * one we're looking for it has to be exact. 
+ */ + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; + dfp++) { + if (!dfp->offset) { + ASSERT(!dfp->length); + seenzero = 1; + continue; + } + ASSERT(seenzero == 0); + if (be16_to_cpu(dfp->offset) == off) { + matched = 1; + ASSERT(dfp->length == dup->length); + } else if (off < be16_to_cpu(dfp->offset)) + ASSERT(off + be16_to_cpu(dup->length) <= be16_to_cpu(dfp->offset)); + else + ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); + ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); + if (dfp > &bf[0]) + ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); + } +#endif + /* + * If this is smaller than the smallest bestfree entry, + * it can't be there since they're sorted. + */ + if (be16_to_cpu(dup->length) < + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) + return NULL; + /* + * Look at the three bestfree entries for our guy. + */ + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { + if (!dfp->offset) + return NULL; + if (be16_to_cpu(dfp->offset) == off) + return dfp; + } + /* + * Didn't find it. This only happens if there are duplicate lengths. + */ + return NULL; +} + +/* + * Insert an unused-space entry into the bestfree table. 
+ */
+xfs_dir2_data_free_t *				/* entry inserted, or NULL if the table was left unchanged */
+xfs_dir2_data_freeinsert(
+	struct xfs_dir2_data_hdr *hdr,		/* data block pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree table pointer */
+	struct xfs_dir2_data_unused *dup,	/* unused space */
+	int			*loghead)	/* log the data header (out) */
+{
+	xfs_dir2_data_free_t	new;		/* new bestfree entry */
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	new.length = dup->length;
+	new.offset = cpu_to_be16((char *)dup - (char *)hdr);
+
+	/*
+	 * Insert at position 0, 1, or 2; or not at all.
+	 * The comparisons are strict, so an entry whose length only equals
+	 * an existing slot's length goes below that slot.  Smaller slots
+	 * shift down and the old third slot is dropped.  *loghead is set
+	 * only when the table is modified.
+	 */
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[0].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = dfp[0];
+		dfp[0] = new;
+		*loghead = 1;
+		return &dfp[0];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[1].length)) {
+		dfp[2] = dfp[1];
+		dfp[1] = new;
+		*loghead = 1;
+		return &dfp[1];
+	}
+	if (be16_to_cpu(new.length) > be16_to_cpu(dfp[2].length)) {
+		dfp[2] = new;
+		*loghead = 1;
+		return &dfp[2];
+	}
+	return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ * The slots below the removed one move up, the last slot is zeroed,
+ * and *loghead is always set.
+ */
+STATIC void
+xfs_dir2_data_freeremove(
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	struct xfs_dir2_data_free *bf,		/* bestfree table pointer */
+	struct xfs_dir2_data_free *dfp,		/* bestfree entry pointer */
+	int			*loghead)	/* out: log data header */
+{
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * It's the first entry, slide the next 2 up.
+	 */
+	if (dfp == &bf[0]) {
+		bf[0] = bf[1];
+		bf[1] = bf[2];
+	}
+	/*
+	 * It's the second entry, slide the 3rd entry up.
+	 */
+	else if (dfp == &bf[1])
+		bf[1] = bf[2];
+	/*
+	 * Must be the last entry.
+	 */
+	else
+		ASSERT(dfp == &bf[2]);
+	/*
+	 * Clear the 3rd entry, must be zero now.
+	 */
+	bf[2].length = 0;
+	bf[2].offset = 0;
+	*loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ * The table is cleared and every unused entry found while walking the
+ * block is offered to xfs_dir2_data_freeinsert().  *loghead is always
+ * set because the table is rewritten.
+ */
+void
+xfs_dir2_data_freescan(
+	struct xfs_inode	*dp,		/* directory inode; supplies d_ops and the dir geometry */
+	struct xfs_dir2_data_hdr *hdr,		/* data block header */
+	int			*loghead)	/* out: log the data header */
+{
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* active data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	char			*endp;		/* end of block's data */
+	char			*p;		/* current entry pointer */
+	struct xfs_da_geometry	*geo = dp->i_mount->m_dir_geo;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Start by clearing the table.
+	 */
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
+	*loghead = 1;
+	/*
+	 * Set up pointers.
+	 * In a block-format directory the data area stops where the leaf
+	 * array begins; otherwise it runs to the end of the block.
+	 */
+	p = (char *)dp->d_ops->data_entry_p(hdr);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
+		btp = xfs_dir2_block_tail_p(geo, hdr);
+		endp = (char *)xfs_dir2_block_leaf_p(btp);
+	} else
+		endp = (char *)hdr + geo->blksize;
+	/*
+	 * Loop over the block's entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's a free entry, insert it.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT((char *)dup - (char *)hdr ==
+			       be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+			xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
+			p += be16_to_cpu(dup->length);
+		}
+		/*
+		 * For active entries, check their tags and skip them.
+		 */
+		else {
+			dep = (xfs_dir2_data_entry_t *)p;
+			ASSERT((char *)dep - (char *)hdr ==
+			       be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+			p += dp->d_ops->data_entsize(dep->namelen);
+		}
+	}
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ * The new block holds a header followed by a single unused entry that
+ * covers the rest of the block; that entry is also bestfree slot 0.
+ * Returns the error from xfs_da_get_buf(), else 0 with *bpp set.
+ */
+int						/* error */
+xfs_dir3_data_init(
+	xfs_da_args_t		*args,		/* directory operation args */
+	xfs_dir2_db_t		blkno,		/* logical dir block number */
+	struct xfs_buf		**bpp)		/* output block buffer */
+{
+	struct xfs_buf		*bp;		/* block buffer */
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	struct xfs_dir2_data_free *bf;		/* bestfree table */
+	int			error;		/* error return value */
+	int			i;		/* bestfree index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int                     t;		/* temp */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Get the buffer set up for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+	bp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF);
+
+	/*
+	 * Initialize the header.
+	 * On CRC-enabled filesystems the larger v3 header is zeroed and
+	 * stamped with block number, owner inode and filesystem UUID.
+	 */
+	hdr = bp->b_addr;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+		memset(hdr3, 0, sizeof(*hdr3));
+		hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+		hdr3->blkno = cpu_to_be64(bp->b_bn);
+		hdr3->owner = cpu_to_be64(dp->i_ino);
+		uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+
+	} else
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);	/* bf[0].length is set below */
+	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+		bf[i].length = 0;
+		bf[i].offset = 0;
+	}
+
+	/*
+	 * Set up an unused entry for the block's body.
+	 * Its length is the block size minus the header, and the same
+	 * value is recorded as the length of bestfree slot 0.
+	 */
+	dup = dp->d_ops->data_unused_p(hdr);
+	dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+
+	t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
+	bf[0].length = cpu_to_be16(t);
+	dup->length = cpu_to_be16(t);
+	*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
+	/*
+	 * Log it and return it.
+	 */
+	xfs_dir2_data_log_header(args, bp);
+	xfs_dir2_data_log_unused(args, bp, dup);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ * The logged byte range runs from the first byte of the entry through
+ * the last byte of the entry's tag.
+ */
+void
+xfs_dir2_data_log_entry(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
+{
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+		(uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
+		       (char *)hdr - 1));
+}
+
+/*
+ * Log a data block header.
+ * The logged byte range is 0 through data_entry_offset - 1, i.e.
+ * everything in front of the first data entry.
+ */
+void
+xfs_dir2_data_log_header(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
+{
+#ifdef DEBUG
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
+
+	xfs_trans_log_buf(args->trans, bp, 0,
+			  args->dp->d_ops->data_entry_offset - 1);
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
+{
+	xfs_dir2_data_hdr_t	*hdr = bp->b_addr;
+
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+
+	/*
+	 * Log the first part of the unused entry.
+	 * That is the range from the first byte of dup through the last
+	 * byte of its length field; the free space in between is not logged.
+	 */
+	xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
+		(uint)((char *)&dup->length + sizeof(dup->length) -
+		       1 - (char *)hdr));
+	/*
+	 * Log the end (tag) of the unused entry.
+	 * The tag is sizeof(xfs_dir2_data_off_t) bytes at the address given
+	 * by xfs_dir2_data_unused_tag_p(dup).
+	 */
+	xfs_trans_log_buf(args->trans, bp,
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
+		(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
+		       sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ *
+ * The range [offset, offset + len) is turned into free space and merged
+ * with a free entry directly before and/or after it when one exists, so
+ * there are four cases: merge with both neighbours, with the previous one
+ * only, with the following one only, or create a stand-alone entry.  In
+ * every case the resulting unused entry is logged here.
+ *
+ * *needscanp must be 0 on entry (asserted) and is overwritten with non-zero
+ * when the bestfree table could not be fixed up in place and has to be
+ * rebuilt by the caller.  needlogp is handed to the bestfree insert/remove
+ * helpers, which set it to 1 when they modify the table.
+ */
+void
+xfs_dir2_data_make_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
+	xfs_dir2_data_aoff_t	len,		/* length in bytes */
+	int			*needlogp,	/* out: log header */
+	int			*needscanp)	/* out: regen bestfree */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	char			*endptr;	/* end of data area */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
+	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+	struct xfs_dir2_data_free *bf;		/* bestfree table in the header */
+
+	hdr = bp->b_addr;
+
+	/*
+	 * Figure out where the end of the data area is.
+	 */
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	    hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
+		endptr = (char *)hdr + args->geo->blksize;
+	else {
+		xfs_dir2_block_tail_t	*btp;	/* block tail */
+
+		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+			hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+		btp = xfs_dir2_block_tail_p(args->geo, hdr);
+		endptr = (char *)xfs_dir2_block_leaf_p(btp);
+	}
+	/*
+	 * If this isn't the start of the block, then back up to
+	 * the previous entry and see if it's free.
+	 * The 16-bit tag stored just before our range holds the offset of
+	 * the entry it belongs to, which is how the previous entry is found.
+	 */
+	if (offset > args->dp->d_ops->data_entry_offset) {
+		__be16			*tagp;	/* tag just before us */
+
+		tagp = (__be16 *)((char *)hdr + offset) - 1;
+		prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+		if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			prevdup = NULL;
+	} else
+		prevdup = NULL;
+	/*
+	 * If this isn't the end of the block, see if the entry after
+	 * us is free.
+	 */
+	if ((char *)hdr + offset + len < endptr) {
+		postdup =
+			(xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG)
+			postdup = NULL;
+	} else
+		postdup = NULL;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * Previous and following entries are both free,
+	 * merge everything into a single free entry.
+	 */
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	if (prevdup && postdup) {
+		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
+
+		/*
+		 * See if prevdup and/or postdup are in bestfree table.
+		 */
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
+		/*
+		 * We need a rescan unless there are exactly 2 free entries
+		 * namely our two.  Then we know what's happening, otherwise
+		 * since the third bestfree is there, there might be more
+		 * entries.
+		 */
+		needscan = (bf[2].length != 0);
+		/*
+		 * Fix up the new big freespace.
+		 * prevdup grows to cover our range and postdup, and its tag
+		 * is rewritten at the new end of the entry.
+		 */
+		be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		if (!needscan) {
+			/*
+			 * Has to be the case that entries 0 and 1 are
+			 * dfp and dfp2 (don't know which is which), and
+			 * entry 2 is empty.
+			 * Remove entry 1 first then entry 0.
+			 */
+			ASSERT(dfp && dfp2);
+			if (dfp == &bf[1]) {
+				dfp = &bf[0];
+				ASSERT(dfp2 == dfp);
+				dfp2 = &bf[1];
+			}
+			xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			/*
+			 * Now insert the new entry.
+			 */
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+						       needlogp);
+			ASSERT(dfp == &bf[0]);
+			ASSERT(dfp->length == prevdup->length);
+			ASSERT(!dfp[1].length);
+			ASSERT(!dfp[2].length);
+		}
+	}
+	/*
+	 * The entry before us is free, merge with it.
+	 */
+	else if (prevdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+		be16_add_cpu(&prevdup->length, len);
+		*xfs_dir2_data_unused_tag_p(prevdup) =
+			cpu_to_be16((char *)prevdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, prevdup);
+		/*
+		 * If the previous entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(prevdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * The following entry is free, merge with it.
+	 * A new entry header is written at our offset; it covers our range
+	 * plus the old following entry.
+	 */
+	else if (postdup) {
+		dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If the following entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else {
+			needscan = be16_to_cpu(newdup->length) >
+				   be16_to_cpu(bf[2].length);
+		}
+	}
+	/*
+	 * Neither neighbor is free.  Make a new entry.
+	 */
+	else {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
+	}
+	*needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp,
+	xfs_dir2_data_unused_t	*dup,		/* unused entry */
+	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
+	xfs_dir2_data_aoff_t	len,		/* length to use */
+	int			*needlogp,	/* out: need to log header */
+	int			*needscanp)	/* out: need regen bestfree; must be 0 on entry */
+{
+	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	int			matchback;	/* matches end of freespace */
+	int			matchfront;	/* matches start of freespace */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
+	int			oldlen;		/* old unused entry's length */
+	struct xfs_dir2_data_free *bf;		/* bestfree table in the header */
+
+	hdr = bp->b_addr;
+	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
+	       hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+	ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG);
+	ASSERT(offset >= (char *)dup - (char *)hdr);	/* range starts inside dup */
+	ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr);	/* and ends inside dup */
+	ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
+	/*
+	 * Look up the entry in the bestfree table.
+	 * dfp is NULL when dup is not one of the three recorded entries.
+	 */
+	oldlen = be16_to_cpu(dup->length);
+	bf = args->dp->d_ops->data_bestfree_p(hdr);
+	dfp = xfs_dir2_data_freefind(hdr, bf, dup);
+	ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
+	/*
+	 * Check for alignment with front and back of the entry.
+	 */
+	matchfront = (char *)dup - (char *)hdr == offset;
+	matchback = (char *)dup + oldlen - (char *)hdr == offset + len;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * If we matched it exactly we just need to get rid of it from
+	 * the bestfree table.
+	 * If the third slot is in use (non-zero offset) the table is not
+	 * touched here and a rescan is requested instead.
+	 */
+	if (matchfront && matchback) {
+		if (dfp) {
+			needscan = (bf[2].offset != 0);
+			if (!needscan)
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+		}
+	}
+	/*
+	 * We match the first part of the entry.
+	 * Make a new entry with the remaining freespace.
+	 */
+	else if (matchfront) {
+		newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup->length = cpu_to_be16(oldlen - len);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * We match the last part of the entry.
+	 * Trim the allocated space off the tail of the entry.
+	 */
+	else if (matchback) {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+						       needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(dfp->length == newdup->length);
+			ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &bf[2];
+		}
+	}
+	/*
+	 * Poking out the middle of an entry.
+	 * Make two new entries.
+	 * newdup is the shortened front piece (it reuses dup), newdup2 is
+	 * the piece left over after the used range.
+	 */
+	else {
+		newdup = dup;
+		newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
+		*xfs_dir2_data_unused_tag_p(newdup) =
+			cpu_to_be16((char *)newdup - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup);
+		newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
+		newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
+		newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
+		*xfs_dir2_data_unused_tag_p(newdup2) =
+			cpu_to_be16((char *)newdup2 - (char *)hdr);
+		xfs_dir2_data_log_unused(args, bp, newdup2);
+		/*
+		 * If the old entry was in the table, we need to scan
+		 * if the 3rd entry was valid, since these entries
+		 * are smaller than the old one.
+		 * If we don't need to scan that means there were 1 or 2
+		 * entries in the table, and removing the old and adding
+		 * the 2 new will work.
+		 */
+		if (dfp) {
+			needscan = (bf[2].length != 0);
+			if (!needscan) {
+				xfs_dir2_data_freeremove(hdr, bf, dfp,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup,
+							 needlogp);
+				xfs_dir2_data_freeinsert(hdr, bf, newdup2,
+							 needlogp);
+			}
+		}
+	}
+	*needscanp = needscan;
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000..106119955
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -0,0 +1,1819 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Local function declarations. + */ +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); + +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. 
+ */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + const struct xfs_dir_ops *ops; + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; + + /* + * we can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + if (!hdr) { + ops->leaf_hdr_from_disk(&leafhdr, leaf); + hdr = &leafhdr; + } + + ents = ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > ops->leaf_max_ents(geo)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. 
*/ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ +static bool +xfs_dir3_leaf_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; + + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leaf->hdr.info.magic != cpu_to_be16(magic)) + return false; + } + + return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_leaf_verify(bp, magic)) { + xfs_buf_ioerror(bp, 
-EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); +} + +static void +xfs_dir3_leaf1_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leaf1_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leafn_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir3_leafn_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, +}; + +static int +xfs_dir3_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; +} + +int +xfs_dir3_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. 
+ */
+static void
+xfs_dir3_leaf_init(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp,
+	xfs_ino_t		owner,		/* stored in the v3 header only */
+	__uint16_t		type)		/* v2 LEAF1 or LEAFN magic */
+{
+	struct xfs_dir2_leaf	*leaf = bp->b_addr;
+
+	ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC);
+
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+
+		memset(leaf3, 0, sizeof(*leaf3));
+
+		leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC)
+					 ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)
+					 : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
+		leaf3->info.blkno = cpu_to_be64(bp->b_bn);
+		leaf3->info.owner = cpu_to_be64(owner);
+		uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+	} else {
+		memset(leaf, 0, sizeof(*leaf));
+		leaf->hdr.info.magic = cpu_to_be16(type);
+	}
+
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * Caller is responsible for initialising the bests table.
+	 *
+	 * In both cases the matching buffer ops are attached and the
+	 * transaction buffer type is set here.
+	 */
+	if (type == XFS_DIR2_LEAF1_MAGIC) {
+		struct xfs_dir2_leaf_tail *ltp;
+
+		ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
+		ltp->bestcount = 0;
+		bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
+	} else {
+		bp->b_ops = &xfs_dir3_leafn_buf_ops;
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
+	}
+}
+
+int						/* 0, or the error from xfs_da_get_buf() */
+xfs_dir3_leaf_get_buf(
+	xfs_da_args_t		*args,
+	xfs_dir2_db_t		bno,		/* must lie in the leaf address range (asserted) */
+	struct xfs_buf		**bpp,		/* out: initialised leaf buffer, set on success only */
+	__uint16_t		magic)		/* v2 LEAF1 or LEAFN magic */
+{
+	struct xfs_inode	*dp = args->dp;
+	struct xfs_trans	*tp = args->trans;
+	struct xfs_mount	*mp = dp->i_mount;
+	struct xfs_buf		*bp;
+	int			error;
+
+	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+	       bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
+
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+			       -1, &bp, XFS_DATA_FORK);
+	if (error)
+		return error;
+
+	xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
+	xfs_dir3_leaf_log_header(args, bp);
+	if (magic == XFS_DIR2_LEAF1_MAGIC)	/* only leaf1 blocks get a tail from leaf_init */
+		xfs_dir3_leaf_log_tail(args, bp);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ *
+ * A new leaf1 block is added and the leaf entries stored at the end of the
+ * single directory block are copied into it.  The space those entries and
+ * the block tail occupied is then turned into free space, the block's magic
+ * is switched to the corresponding data-block magic, and the leaf's bests
+ * table is set up with one entry taken from bestfree slot 0 of the data
+ * block.
+ *
+ * Returns 0, or the error from xfs_da_grow_inode() or
+ * xfs_dir3_leaf_get_buf().
+ */
+int						/* error */
+xfs_dir2_block_to_leaf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	struct xfs_buf		*dbp)		/* input block's buffer */
+{
+	__be16			*bestsp;	/* leaf's bestsp entries */
+	xfs_dablk_t		blkno;		/* leaf block's bno */
+	xfs_dir2_data_hdr_t	*hdr;		/* block header */
+	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block's tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	struct xfs_buf		*lbp;		/* leaf block's buffer */
+	xfs_dir2_db_t		ldb;		/* leaf block's bno */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to rescan bestfree */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	struct xfs_dir2_data_free *bf;		/* data block's bestfree table */
+	struct xfs_dir2_leaf_entry *ents;	/* new leaf's entry array */
+	struct xfs_dir3_icleaf_hdr leafhdr;	/* in-core copy of the leaf header */
+
+	trace_xfs_dir2_block_to_leaf(args);
+
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Add the leaf block to the inode.
+	 * This interface will only put blocks in the leaf/node range.
+	 * Since that's empty now, we'll get the root (block 0 in range).
+	 */
+	if ((error = xfs_da_grow_inode(args, &blkno))) {
+		return error;
+	}
+	ldb = xfs_dir2_da_to_db(args->geo, blkno);
+	ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
+	/*
+	 * Initialize the leaf block, get a buffer for it.
+	 */
+	error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC);
+	if (error)
+		return error;
+
+	leaf = lbp->b_addr;
+	hdr = dbp->b_addr;
+	xfs_dir3_data_check(dp, dbp);
+	btp = xfs_dir2_block_tail_p(args->geo, hdr);
+	blp = xfs_dir2_block_leaf_p(btp);
+	bf = dp->d_ops->data_bestfree_p(hdr);
+	ents = dp->d_ops->leaf_ents_p(leaf);
+
+	/*
+	 * Set the counts in the leaf header.
+	 * They are taken unchanged from the block's tail.
+	 */
+	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+	leafhdr.count = be32_to_cpu(btp->count);
+	leafhdr.stale = be32_to_cpu(btp->stale);
+	dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+	xfs_dir3_leaf_log_header(args, lbp);
+
+	/*
+	 * Could compact these but I think we always do the conversion
+	 * after squeezing out stale entries.
+	 */
+	memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
+	needscan = 0;
+	needlog = 1;
+	/*
+	 * Make the space formerly occupied by the leaf entries and block
+	 * tail be free.
+	 * The range starts at blp and runs to the end of the block.
+	 */
+	xfs_dir2_data_make_free(args, dbp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
+				       (char *)blp),
+		&needlog, &needscan);
+	/*
+	 * Fix up the block header, make it a data block.
+	 */
+	dbp->b_ops = &xfs_dir3_data_buf_ops;
+	xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF);
+	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
+		hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	else
+		hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
+
+	if (needscan)
+		xfs_dir2_data_freescan(dp, hdr, &needlog);
+	/*
+	 * Set up leaf tail and bests table.
+	 */
+	ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+	ltp->bestcount = cpu_to_be32(1);
+	bestsp = xfs_dir2_leaf_bests_p(ltp);
+	bestsp[0] =  bf[0].length;
+	/*
+	 * Log the data header and leaf bests table.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(args, dbp);
+	xfs_dir3_leaf_check(dp, lbp);
+	xfs_dir3_data_check(dp, dbp);
+	xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
+	return 0;
+}
+
+STATIC void
+xfs_dir3_leaf_find_stale(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,		/* insertion point to search around */
+	int			*lowstale,	/* out: stale index below index, or -1 */
+	int			*highstale)	/* out: where the upward scan stopped */
+{
+	/*
+	 * Find the first stale entry before our index, if any.
+	 * *lowstale is left at -1 when there is none.
+	 */
+	for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) {
+		if (ents[*lowstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+	}
+
+	/*
+	 * Find the first stale entry at or after our index, if any.
+	 * Stop if the result would require moving more entries than using
+	 * lowstale.
+	 * *highstale therefore only names a stale entry when the scan ended
+	 * on the first break; it equals leafhdr->count when the scan ran off
+	 * the end of the table.
+	 */
+	for (*highstale = index; *highstale < leafhdr->count; ++*highstale) {
+		if (ents[*highstale].address ==
+		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
+			break;
+		if (*lowstale >= 0 && index - *lowstale <= *highstale - index)
+			break;
+	}
+}
+
+struct xfs_dir2_leaf_entry *			/* slot to fill with the new entry */
+xfs_dir3_leaf_find_entry(
+	struct xfs_dir3_icleaf_hdr *leafhdr,
+	struct xfs_dir2_leaf_entry *ents,
+	int			index,		/* leaf table position */
+	int			compact,	/* need to compact leaves */
+	int			lowstale,	/* index of prev stale leaf */
+	int			highstale,	/* index of next stale leaf */
+	int			*lfloglow,	/* low leaf logging index */
+	int			*lfloghigh)	/* high leaf logging index */
+{
+	if (!leafhdr->stale) {
+		xfs_dir2_leaf_entry_t	*lep;	/* leaf entry table pointer */
+
+		/*
+		 * Now we need to make room to insert the leaf entry.
+		 *
+		 * If there are no stale entries, just insert a hole at index.
+		 */
+		lep = &ents[index];
+		if (index < leafhdr->count)
+			memmove(lep + 1, lep,
+				(leafhdr->count - index) * sizeof(*lep));
+
+		/*
+		 * Record low and high logging indices for the leaf.
+		 * count is post-incremented, so *lfloghigh is the old count,
+		 * which is the index of the new last entry.
+		 */
+		*lfloglow = index;
+		*lfloghigh = leafhdr->count++;
+		return lep;
+	}
+
+	/*
+	 * There are stale entries.
+	 *
+	 * We will use one of them for the new entry.  It's probably not at
+	 * the right location, so we'll have to shift some up or down first.
+	 *
+	 * If we didn't compact before, we need to find the nearest stale
+	 * entries before and after our insertion point.
+	 *
+	 * From here on *lfloglow and *lfloghigh are read as well as written
+	 * (MIN/MAX against the current values), so the caller must have
+	 * initialised them.
+	 */
+	if (compact == 0)
+		xfs_dir3_leaf_find_stale(leafhdr, ents, index,
+					 &lowstale, &highstale);
+
+	/*
+	 * If the low one is better, use it.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == leafhdr->count ||
+	     index - lowstale - 1 < highstale - index)) {
+		ASSERT(index - lowstale - 1 >= 0);
+		ASSERT(ents[lowstale].address ==
+		       cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+		/*
+		 * Copy entries up to cover the stale entry and make room
+		 * for the new entry.
+		 * The freed slot ends up at index - 1.
+		 */
+		if (index - lowstale - 1 > 0) {
+			memmove(&ents[lowstale], &ents[lowstale + 1],
+				(index - lowstale - 1) *
+					sizeof(xfs_dir2_leaf_entry_t));
+		}
+		*lfloglow = MIN(lowstale, *lfloglow);
+		*lfloghigh = MAX(index - 1, *lfloghigh);
+		leafhdr->stale--;
+		return &ents[index - 1];
+	}
+
+	/*
+	 * The high one is better, so use that one.
+	 */
+	ASSERT(highstale - index >= 0);
+	ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR));
+
+	/*
+	 * Copy entries down to cover the stale entry and make room for the
+	 * new entry.
+	 * The freed slot ends up at index.
+	 */
+	if (highstale - index > 0) {
+		memmove(&ents[index + 1], &ents[index],
+			(highstale - index) * sizeof(xfs_dir2_leaf_entry_t));
+	}
+	*lfloglow = MIN(index, *lfloglow);
+	*lfloghigh = MAX(highstale, *lfloghigh);
+	leafhdr->stale--;
+	return &ents[index];
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */ +int /* error */ +xfs_dir2_leaf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* freespace table in leaf */ + int compact; /* need to compact leaves */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry */ + int error; /* error return value */ + int grown; /* allocated new data block */ + int highstale; /* index of next stale leaf */ + int i; /* temporary, index */ + int index; /* leaf table position */ + struct xfs_buf *lbp; /* leaf's buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int length; /* length of new entry */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + int lfloglow; /* low leaf logging index */ + int lfloghigh; /* high leaf logging index */ + int lowstale; /* index of prev stale leaf */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int needbytes; /* leaf block bytes needed */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data free */ + __be16 *tagp; /* end of data entry */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_addname(args); + + dp = args->dp; + tp = args->trans; + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) + return error; + + /* + * Look up the entry by hash value and name. + * We know it's not there, our caller has already done a lookup. + * So the index is of the entry to insert in front of. + * But if there are dup hash values the index is of the first of those. 
+ */ + index = xfs_dir2_leaf_search_hash(args, lbp); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = dp->d_ops->data_entsize(args->namelen); + + /* + * See if there are any entries with the same hash value + * and space in their block for the new entry. + * This is good because it puts multiple same-hash value entries + * in a data block, improving the lookup of those entries. + */ + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + index++, lep++) { + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + ASSERT(i < be32_to_cpu(ltp->bestcount)); + ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); + if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + /* + * Didn't find a block yet, linear search all the data blocks. + */ + if (use_block == -1) { + for (i = 0; i < be32_to_cpu(ltp->bestcount); i++) { + /* + * Remember a block we see that's missing. + */ + if (bestsp[i] == cpu_to_be16(NULLDATAOFF) && + use_block == -1) + use_block = i; + else if (be16_to_cpu(bestsp[i]) >= length) { + use_block = i; + break; + } + } + } + /* + * How many bytes do we need in the leaf block? + */ + needbytes = 0; + if (!leafhdr.stale) + needbytes += sizeof(xfs_dir2_leaf_entry_t); + if (use_block == -1) + needbytes += sizeof(xfs_dir2_data_off_t); + + /* + * Now kill use_block if it refers to a missing block, so we + * can use it as an indication of allocation needed. + */ + if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF)) + use_block = -1; + /* + * If we don't have enough free bytes but we can make enough + * by compacting out stale entries, we'll do that. 
+ */ + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) + compact = 1; + + /* + * Otherwise if we don't have enough free bytes we need to + * convert to node form. + */ + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { + /* + * Just checking or no space reservation, give up. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + xfs_trans_brelse(tp, lbp); + return -ENOSPC; + } + /* + * Convert to node form. + */ + error = xfs_dir2_leaf_to_node(args, lbp); + if (error) + return error; + /* + * Then add the new entry. + */ + return xfs_dir2_node_addname(args); + } + /* + * Otherwise it will fit without compaction. + */ + else + compact = 0; + /* + * If just checking, then it will fit unless we needed to allocate + * a new data block. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, lbp); + return use_block == -1 ? -ENOSPC : 0; + } + /* + * If no allocations are allowed, return now before we've + * changed anything. + */ + if (args->total == 0 && use_block == -1) { + xfs_trans_brelse(tp, lbp); + return -ENOSPC; + } + /* + * Need to compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and we'll shift that one to our insertion + * point later. + */ + if (compact) { + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + } + /* + * There are stale entries, so we'll need log-low and log-high + * impossibly bad values later. + */ + else if (leafhdr.stale) { + lfloglow = leafhdr.count; + lfloghigh = -1; + } + /* + * If there was no data block space found, we need to allocate + * a new one. + */ + if (use_block == -1) { + /* + * Add the new data block. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, + &use_block))) { + xfs_trans_brelse(tp, lbp); + return error; + } + /* + * Initialize the block. 
+ */ + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { + xfs_trans_brelse(tp, lbp); + return error; + } + /* + * If we're adding a new data block on the end we need to + * extend the bests table. Copy it up one entry. + */ + if (use_block >= be32_to_cpu(ltp->bestcount)) { + bestsp--; + memmove(&bestsp[0], &bestsp[1], + be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); + be32_add_cpu(<p->bestcount, 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); + } + /* + * If we're filling in a previously empty block just log it. + */ + else + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; + grown = 1; + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + grown = 0; + } + /* + * Point to the biggest freespace in our data block. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + ASSERT(be16_to_cpu(dup->length) >= length); + needscan = needlog = 0; + /* + * Mark the initial part of our freespace in use for the new entry. + */ + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, + &needlog, &needscan); + /* + * Initialize our new entry (at last). + */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + /* + * Need to scan fix up the bestfree table. 
+ */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + /* + * Need to log the data block's header. + */ + if (needlog) + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); + /* + * If the bests table needs to be changed, do it. + * Log the change unless we've already done that. + */ + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; + if (!grown) + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); + } + + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + + /* + * Fill in the new leaf entry. + */ + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, + be16_to_cpu(*tagp))); + /* + * Log the leaf fields and give up the buffers. + */ + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); + return 0; +} + +/* + * Compact out any stale entries in the leaf. + * Log the header and changed leaf entries, if any. + */ +void +xfs_dir3_leaf_compact( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_buf *bp) /* leaf buffer */ +{ + int from; /* source leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int loglow; /* first leaf entry to log */ + int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = args->dp; + + leaf = bp->b_addr; + if (!leafhdr->stale) + return; + + /* + * Compress out the stale entries in place. + */ + ents = dp->d_ops->leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + continue; + /* + * Only actually copy the entries that are different. 
+ */ + if (from > to) { + if (loglow == -1) + loglow = to; + ents[to] = ents[from]; + } + to++; + } + /* + * Update and log the header, log the leaf entries. + */ + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args, bp); + if (loglow != -1) + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); +} + +/* + * Compact the leaf entries, removing stale ones. + * Leave one stale entry behind - the one closest to our + * insertion index - and the caller will shift that one to our insertion + * point later. + * Return new insertion index, where the remaining stale entry is, + * and leaf logging indices. + */ +void +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int *indexp, /* insertion index */ + int *lowstalep, /* out: stale entry before us */ + int *highstalep, /* out: stale entry after us */ + int *lowlogp, /* out: low log index */ + int *highlogp) /* out: high log index */ +{ + int from; /* source copy index */ + int highstale; /* stale entry at/after index */ + int index; /* insertion index */ + int keepstale; /* source index of kept stale */ + int lowstale; /* stale entry before index */ + int newindex=0; /* new insertion index */ + int to; /* destination copy index */ + + ASSERT(leafhdr->stale > 1); + index = *indexp; + + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); + + /* + * Pick the better of lowstale and highstale. + */ + if (lowstale >= 0 && + (highstale == leafhdr->count || + index - lowstale <= highstale - index)) + keepstale = lowstale; + else + keepstale = highstale; + /* + * Copy the entries in place, removing all the stale entries + * except keepstale. + */ + for (from = to = 0; from < leafhdr->count; from++) { + /* + * Notice the new value of index. 
+ */ + if (index == from) + newindex = to; + if (from != keepstale && + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (from == to) + *lowlogp = to; + continue; + } + /* + * Record the new keepstale value for the insertion. + */ + if (from == keepstale) + lowstale = highstale = to; + /* + * Copy only the entries that have moved. + */ + if (from > to) + ents[to] = ents[from]; + to++; + } + ASSERT(from > to); + /* + * If the insertion point was past the last entry, + * set the new insertion point accordingly. + */ + if (index == from) + newindex = to; + *indexp = newindex; + /* + * Adjust the leaf header values. + */ + leafhdr->count -= from - to; + leafhdr->stale = 1; + /* + * Remember the low/high stale value only in the "right" + * direction. + */ + if (lowstale >= newindex) + lowstale = -1; + else + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; + *lowstalep = lowstale; + *highstalep = highstale; +} + +/* + * Log the bests entries indicated from a leaf1 block. + */ +static void +xfs_dir3_leaf_log_bests( + struct xfs_da_args *args, + struct xfs_buf *bp, /* leaf buffer */ + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + __be16 *firstb; /* pointer to first entry */ + __be16 *lastb; /* pointer to last entry */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), + (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); +} + +/* + * Log the leaf entries indicated from a leaf1 or leafn block. 
+ */ +void +xfs_dir3_leaf_log_ents( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, + int last) +{ + xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ + xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ents = args->dp->d_ops->leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), + (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); +} + +/* + * Log the header of the leaf1 or leafn block. + */ +void +xfs_dir3_leaf_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); +} + +/* + * Log the tail of the leaf1 block. 
+ */ +STATIC void +xfs_dir3_leaf_log_tail( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); +} + +/* + * Look up the entry referred to by args in the leaf format directory. + * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which + * is also used by the node-format code. + */ +int +xfs_dir2_leaf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* found entry index */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_lookup(args); + + /* + * Look up name in the leaf block, returning both buffers and index. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + tp = args->trans; + dp = args->dp; + xfs_dir3_leaf_check(dp, lbp); + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Get to the leaf entry and contained data entry address. + */ + lep = &ents[index]; + + /* + * Point to the data entry. 
+ */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + /* + * Return the found inode number & CI name if appropriate + */ + args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); + return error; +} + +/* + * Look up name/hash in the leaf block. + * Fill in indexp with the found index, and dbpp with the data buffer. + * If not found dbpp will be NULL, and ENOENT comes back. + * lbpp will always be filled in with the leaf buffer unless there's an error. + */ +static int /* error */ +xfs_dir2_leaf_lookup_int( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf **lbpp, /* out: leaf buffer */ + int *indexp, /* out: index in leaf block */ + struct xfs_buf **dbpp) /* out: data buffer */ +{ + xfs_dir2_db_t curdb = -1; /* current data block number */ + struct xfs_buf *dbp = NULL; /* data buffer */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index in leaf block */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. 
*/ + enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) + return error; + + *lbpp = lbp; + leaf = lbp->b_addr; + xfs_dir3_leaf_check(dp, lbp); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + /* + * Look for the first leaf entry with our hash value. + */ + index = xfs_dir2_leaf_search_hash(args, lbp); + /* + * Loop over all the entries with the right hash value + * looking to match the name. + */ + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip over stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Get the new data block number. + */ + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); + /* + * If it's not the same as the old data block number, + * need to pitch the old one and read the new one. + */ + if (newdb != curdb) { + if (dbp) + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + curdb = newdb; + } + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + /* + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + *indexp = index; + /* case exact match: return the current buffer. 
*/ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } + /* + * No match found, return -ENOENT. + */ + ASSERT(cidb == -1); + if (dbp) + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); + return -ENOENT; +} + +/* + * Remove an entry from a leaf format directory. + */ +int /* error */ +xfs_dir2_leaf_removename( + xfs_da_args_t *args) /* operation arguments */ +{ + __be16 *bestsp; /* leaf block best freespace */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t db; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry structure */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_db_t i; /* temporary data block # */ + int index; /* index into leaf entries */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_dir2_data_off_t oldbest; /* old value of best free */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_removename(args); + + /* + * Lookup the leaf entry, get the leaf and data blocks read in. 
+ */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + leaf = lbp->b_addr; + hdr = dbp->b_addr; + xfs_dir3_data_check(dp, dbp); + bf = dp->d_ops->data_bestfree_p(hdr); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Point to the leaf entry, use that to point to the data entry. + */ + lep = &ents[index]; + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + needscan = needlog = 0; + oldbest = be16_to_cpu(bf[0].length); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + /* + * Mark the former data entry unused. + */ + xfs_dir2_data_make_free(args, dbp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + /* + * We just mark the leaf entry stale by putting a null in it. + */ + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir3_leaf_log_ents(args, lbp, index, index); + + /* + * Scan the freespace in the data block again if necessary, + * log the data block header if necessary. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, dbp); + /* + * If the longest freespace in the data block has changed, + * put the new value in the bests table and log that. + */ + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(args, lbp, db, db); + } + xfs_dir3_data_check(dp, dbp); + /* + * If the data block is now empty then get rid of the data block. 
+ */ + if (be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + /* + * Nope, can't get rid of it because it caused + * allocation of a bmap btree block to do so. + * Just go on, returning success, leaving the + * empty block in place. + */ + if (error == -ENOSPC && args->total == 0) + error = 0; + xfs_dir3_leaf_check(dp, lbp); + return error; + } + dbp = NULL; + /* + * If this is the last data block then compact the + * bests table by getting rid of entries. + */ + if (db == be32_to_cpu(ltp->bestcount) - 1) { + /* + * Look for the last active entry (i). + */ + for (i = db - 1; i > 0; i--) { + if (bestsp[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + /* + * Copy the table down so inactive entries at the + * end are removed. + */ + memmove(&bestsp[db - i], bestsp, + (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); + be32_add_cpu(<p->bestcount, -(db - i)); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); + } else + bestsp[db] = cpu_to_be16(NULLDATAOFF); + } + /* + * If the data block was not the first one, drop it. + */ + else if (db != args->geo->datablk) + dbp = NULL; + + xfs_dir3_leaf_check(dp, lbp); + /* + * See if we can convert to block form. + */ + return xfs_dir2_leaf_to_block(args, lbp, dbp); +} + +/* + * Replace the inode number in a leaf format directory entry. 
+ */ +int /* error */ +xfs_dir2_leaf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + int index; /* index of leaf entry */ + struct xfs_buf *lbp; /* leaf buffer */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_replace(args); + + /* + * Look up the entry. + */ + if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) { + return error; + } + dp = args->dp; + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); + /* + * Point to the leaf entry, get data address from it. + */ + lep = &ents[index]; + /* + * Point to the data entry. + */ + dep = (xfs_dir2_data_entry_t *) + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); + ASSERT(args->inumber != be64_to_cpu(dep->inumber)); + /* + * Put the new inode number in, log it. + */ + dep->inumber = cpu_to_be64(args->inumber); + dp->d_ops->data_put_ftype(dep, args->filetype); + tp = args->trans; + xfs_dir2_data_log_entry(args, dbp, dep); + xfs_dir3_leaf_check(dp, lbp); + xfs_trans_brelse(tp, lbp); + return 0; +} + +/* + * Return index in the leaf block (lbp) which is either the first + * one with this hash value, or if there are none, the insert point + * for that hash value. 
+ */ +int /* index value */ +xfs_dir2_leaf_search_hash( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp) /* leaf buffer */ +{ + xfs_dahash_t hash=0; /* hash from this entry */ + xfs_dahash_t hashwant; /* hash value looking for */ + int high; /* high leaf index */ + int low; /* low leaf index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + leaf = lbp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + /* + * Note, the table cannot be empty, so we have to go through the loop. + * Binary search the leaf entries looking for our hash value. + */ + for (lep = ents, low = 0, high = leafhdr.count - 1, + hashwant = args->hashval; + low <= high; ) { + mid = (low + high) >> 1; + if ((hash = be32_to_cpu(lep[mid].hashval)) == hashwant) + break; + if (hash < hashwant) + low = mid + 1; + else + high = mid - 1; + } + /* + * Found one, back up through all the equal hash values. + */ + if (hash == hashwant) { + while (mid > 0 && be32_to_cpu(lep[mid - 1].hashval) == hashwant) { + mid--; + } + } + /* + * Need to point to an entry higher than ours. + */ + else if (hash < hashwant) + mid++; + return mid; +} + +/* + * Trim off a trailing data block. We know it's empty since the leaf + * freespace table says so. 
+ */ +int /* error */ +xfs_dir2_leaf_trim_data( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp, /* leaf buffer */ + xfs_dir2_db_t db) /* data block number */ +{ + __be16 *bestsp; /* leaf bests table */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + xfs_trans_t *tp; /* transaction pointer */ + + dp = args->dp; + tp = args->trans; + /* + * Read the offending data block. We need its buffer. + */ + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); + if (error) + return error; + + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + +#ifdef DEBUG +{ + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset); + ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); +} +#endif + + /* + * Get rid of the data block. + */ + if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { + ASSERT(error != -ENOSPC); + xfs_trans_brelse(tp, dbp); + return error; + } + /* + * Eliminate the last bests entry from the table. 
+ */ + bestsp = xfs_dir2_leaf_bests_p(ltp); + be32_add_cpu(<p->bestcount, -1); + memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + return 0; +} + +static inline size_t +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, + int counts) +{ + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); + + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); +} + +/* + * Convert node form directory to leaf form directory. + * The root of the node form dir needs to already be a LEAFN block. + * Just return if we can't do anything. + */ +int /* error */ +xfs_dir2_node_to_leaf( + xfs_da_state_t *state) /* directory operation state */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + struct xfs_buf *fbp; /* buffer for freespace block */ + xfs_fileoff_t fo; /* freespace file offset */ + xfs_dir2_free_t *free; /* freespace structure */ + struct xfs_buf *lbp; /* buffer for leaf block */ + xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_mount_t *mp; /* filesystem mount point */ + int rval; /* successful free trim? */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; + + /* + * There's more than a leaf level in the btree, so there must + * be multiple leafn blocks. Give up. 
+ */ + if (state->path.active > 1) + return 0; + args = state->args; + + trace_xfs_dir2_node_to_leaf(args); + + mp = state->mp; + dp = args->dp; + tp = args->trans; + /* + * Get the last offset in the file. + */ + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { + return error; + } + fo -= args->geo->fsbcount; + /* + * If there are freespace blocks other than the first one, + * take this opportunity to remove trailing empty freespace blocks + * that may have been left behind during no-space-reservation + * operations. + */ + while (fo > args->geo->freeblk) { + if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { + return error; + } + if (rval) + fo -= args->geo->fsbcount; + else + return 0; + } + /* + * Now find the block just before the freespace block. + */ + if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) { + return error; + } + /* + * If it's not the single leaf block, give up. + */ + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) + return 0; + lbp = state->path.blk[0].bp; + leaf = lbp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + + /* + * Read the freespace block. + */ + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + if (error) + return error; + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); + + /* + * Now see if the leafn and free data will fit in a leaf1. + * If not, release the buffer and give up. + */ + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { + xfs_trans_brelse(tp, fbp); + return 0; + } + + /* + * If the leaf has any stale entries in it, compress them out. + */ + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); + + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? 
XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; + + /* + * Set up the leaf tail from the freespace block. + */ + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + + /* + * Set up the leaf bests table. + */ + memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + + /* + * Get rid of the freespace block. + */ + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); + if (error) { + /* + * This can't fail here because it can only happen when + * punching out the middle of an extent, and this is an + * isolated block. + */ + ASSERT(error != -ENOSPC); + return error; + } + fbp = NULL; + /* + * Now see if we can convert the single-leaf directory + * down to a block form directory. + * This routine always kills the dabuf for the leaf, so + * eliminate it from the path. + */ + error = xfs_dir2_leaf_to_block(args, lbp, NULL); + state->path.blk[0].bp = NULL; + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_node.c b/kernel/fs/xfs/libxfs/xfs_dir2_node.c new file mode 100644 index 000000000..41b80d3d3 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_node.c @@ -0,0 +1,2270 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" + +/* + * Function declarations. + */ +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); +static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, + xfs_da_state_blk_t *blk1, + xfs_da_state_blk_t *blk2); +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, + int index, xfs_da_state_blk_t *dblk, + int *rval); +static int xfs_dir2_node_addname_int(xfs_da_args_t *args, + xfs_da_state_blk_t *fblk); + +/* + * Check internal consistency of a leafn block. 
+ */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +static bool +xfs_dir3_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; + } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; +} + +static void +xfs_dir3_free_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_free_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + 
xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, +}; + + +static int +__xfs_dir3_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + +static int +xfs_dir3_free_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. 
+ */ + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; + dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; +} + +/* + * Log entries from a freespace block. + */ +STATIC void +xfs_dir2_free_log_bests( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, /* first entry to log */ + int last) /* last entry to log */ +{ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; + + free = bp->b_addr; + bests = args->dp->d_ops->free_bests_p(free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); +} + +/* + * Log header from a freespace block. + */ +static void +xfs_dir2_free_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) +{ +#ifdef DEBUG + xfs_dir2_free_t *free; /* freespace structure */ + + free = bp->b_addr; + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); +#endif + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); +} + +/* + * Convert a leaf-format directory to a node-format directory. + * We need to change the magic number of the leaf block, and copy + * the freespace table out of the leaf block into its own block. 
+ */ +int /* error */ +xfs_dir2_leaf_to_node( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *lbp) /* leaf buffer */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + struct xfs_buf *fbp; /* freespace buffer */ + xfs_dir2_db_t fdb; /* freespace block number */ + xfs_dir2_free_t *free; /* freespace structure */ + __be16 *from; /* pointer to freespace entry */ + int i; /* leaf freespace index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ + int n; /* count of live freespc ents */ + xfs_dir2_data_off_t off; /* freespace entry value */ + __be16 *to; /* pointer to freespace entry */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; + + trace_xfs_dir2_leaf_to_node(args); + + dp = args->dp; + tp = args->trans; + /* + * Add a freespace block to the directory. + */ + if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { + return error; + } + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); + /* + * Get the buffer for the new freespace block. + */ + error = xfs_dir3_free_get_buf(args, fdb, &fbp); + if (error) + return error; + + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / args->geo->blksize); + + /* + * Copy freespace entries from the leaf block to the new block. + * Count active entries. + */ + from = xfs_dir2_leaf_bests_p(ltp); + to = dp->d_ops->free_bests_p(free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + if ((off = be16_to_cpu(*from)) != NULLDATAOFF) + n++; + *to = cpu_to_be16(off); + } + + /* + * Now initialize the freespace block header. 
+ */ + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); + + /* + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. + */ + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + return 0; +} + +/* + * Add a leaf entry to a leaf block in a node-form directory. + * The other work necessary is done from the caller. + */ +static int /* error */ +xfs_dir2_leafn_add( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int index) /* insertion pt for new entry */ +{ + int compact; /* compacting stale leaves */ + xfs_inode_t *dp; /* incore directory inode */ + int highstale; /* next stale entry */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int lfloghigh; /* high leaf entry logging */ + int lfloglow; /* low leaf entry logging */ + int lowstale; /* previous stale entry */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_add(args, index); + + dp = args->dp; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + /* + * Quick check just to make sure we are not going to index + * into other peoples memory + */ + if (index < 0) + return -EFSCORRUPTED; + + /* + * If there are already the maximum number of leaf entries in + * the block, 
if there are no stale entries it won't fit. + * Caller will do a split. If there are stale entries we'll do + * a compact. + */ + + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (!leafhdr.stale) + return -ENOSPC; + compact = leafhdr.stale > 1; + } else + compact = 0; + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); + + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Compact out all but one stale leaf entry. Leaves behind + * the entry closest to index. + */ + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { + /* + * Set impossible logging indices for this case. + */ + lfloglow = leafhdr.count; + lfloghigh = -1; + } + + /* + * Insert the new entry, log everything. + */ + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + + lep->hashval = cpu_to_be32(args->hashval); + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, + args->blkno, args->index)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, bp); + return 0; +} + +#ifdef DEBUG +static void +xfs_dir2_free_hdr_check( + struct xfs_inode *dp, + struct xfs_buf *bp, + xfs_dir2_db_t db) +{ + struct xfs_dir3_icfree_hdr hdr; + + dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); +} +#else +#define xfs_dir2_free_hdr_check(dp, bp, db) +#endif /* DEBUG */ + +/* + * Return the last hash value in the leaf. + * Stale entries are ok. 
 * If count is non-NULL it receives the number of entries in the leaf.
 * An empty leaf yields a hash value of 0.
 */
xfs_dahash_t					/* hash value */
xfs_dir2_leafn_lasthash(
	struct xfs_inode *dp,
	struct xfs_buf	*bp,			/* leaf buffer */
	int		*count)			/* count of entries in leaf */
{
	struct xfs_dir2_leaf	*leaf = bp->b_addr;
	struct xfs_dir2_leaf_entry *ents;
	struct xfs_dir3_icleaf_hdr leafhdr;

	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);

	ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
	       leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);

	if (count)
		*count = leafhdr.count;
	if (!leafhdr.count)
		return 0;

	ents = dp->d_ops->leaf_ents_p(leaf);
	return be32_to_cpu(ents[leafhdr.count - 1].hashval);
}

/*
 * Look up a leaf entry for space to add a name in a node-format leaf block.
 * The extrablk in state is a freespace block.
 *
 * Walks the leaf entries whose hash equals args->hashval, looking at the
 * freespace entry of each distinct data block they point to.  The walk
 * stops at the first data block whose best free length can hold the new
 * entry.
 *
 * On the normal exit this always returns -ENOENT with *indexp set to the
 * insertion point.  If a freespace buffer is in hand it is handed back in
 * state->extrablk, with extrablk.index set to the matching bests slot or
 * -1 when no block had room.  Other returns are -EFSCORRUPTED (a
 * referenced data block has a NULLDATAOFF bests entry) or the error from
 * reading a freespace block.
 */
STATIC int
xfs_dir2_leafn_lookup_for_addname(
	struct xfs_buf		*bp,		/* leaf buffer */
	xfs_da_args_t		*args,		/* operation arguments */
	int			*indexp,	/* out: leaf entry index */
	xfs_da_state_t		*state)		/* state to fill in */
{
	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
	xfs_dir2_db_t		curdb = -1;	/* current data block number */
	xfs_dir2_db_t		curfdb = -1;	/* current free block number */
	xfs_inode_t		*dp;		/* incore directory inode */
	int			error;		/* error return value */
	int			fi;		/* free entry index */
	xfs_dir2_free_t		*free = NULL;	/* free block structure */
	int			index;		/* leaf entry index */
	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
	int			length;		/* length of new data entry */
	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
	xfs_mount_t		*mp;		/* filesystem mount point */
	xfs_dir2_db_t		newdb;		/* new data block number */
	xfs_dir2_db_t		newfdb;		/* new free block number */
	xfs_trans_t		*tp;		/* transaction pointer */
	struct xfs_dir2_leaf_entry *ents;
	struct xfs_dir3_icleaf_hdr leafhdr;

	dp = args->dp;
	tp = args->trans;
	mp = dp->i_mount;
	leaf = bp->b_addr;
	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
	ents = dp->d_ops->leaf_ents_p(leaf);

	xfs_dir3_leaf_check(dp, bp);
	ASSERT(leafhdr.count > 0);

	/*
	 * Look up the hash value in the leaf entries.
	 */
	index = xfs_dir2_leaf_search_hash(args, bp);
	/*
	 * Do we have a buffer coming in?
	 */
	if (state->extravalid) {
		/* If so, it's a free block buffer, get the block number. */
		curbp = state->extrablk.bp;
		curfdb = state->extrablk.blkno;
		free = curbp->b_addr;
		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
		       free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
	}
	length = dp->d_ops->data_entsize(args->namelen);
	/*
	 * Loop over leaf entries with the right hash value.
	 */
	for (lep = &ents[index];
	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
	     lep++, index++) {
		/*
		 * Skip stale leaf entries.
		 */
		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
			continue;
		/*
		 * Pull the data block number from the entry.
		 */
		newdb = xfs_dir2_dataptr_to_db(args->geo,
					       be32_to_cpu(lep->address));
		/*
		 * For addname, we're looking for a place to put the new entry.
		 * We want to use a data block with an entry of equal
		 * hash value to ours if there is one with room.
		 *
		 * If this block isn't the data block we already have
		 * in hand, take a look at it.
		 */
		if (newdb != curdb) {
			__be16 *bests;

			curdb = newdb;
			/*
			 * Convert the data block to the free block
			 * holding its freespace information.
			 */
			newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
			/*
			 * If it's not the one we have in hand, read it in.
			 */
			if (newfdb != curfdb) {
				/*
				 * If we had one before, drop it.
				 */
				if (curbp)
					xfs_trans_brelse(tp, curbp);

				error = xfs_dir2_free_read(tp, dp,
						xfs_dir2_db_to_da(args->geo,
								  newfdb),
						&curbp);
				if (error)
					return error;
				free = curbp->b_addr;

				xfs_dir2_free_hdr_check(dp, curbp, curdb);
			}
			/*
			 * Get the index for our entry.
			 */
			fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
			/*
			 * If it has room, return it.
			 */
			bests = dp->d_ops->free_bests_p(free);
			if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
				XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
							XFS_ERRLEVEL_LOW, mp);
				/*
				 * Only release the buffer if it was read in
				 * this iteration; curfdb is not updated to
				 * newfdb until after this check.
				 */
				if (curfdb != newfdb)
					xfs_trans_brelse(tp, curbp);
				return -EFSCORRUPTED;
			}
			curfdb = newfdb;
			if (be16_to_cpu(bests[fi]) >= length)
				goto out;
		}
	}
	/* Didn't find any space */
	fi = -1;
out:
	ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
	if (curbp) {
		/* Giving back a free block. */
		state->extravalid = 1;
		state->extrablk.bp = curbp;
		state->extrablk.index = fi;
		state->extrablk.blkno = curfdb;

		/*
		 * Important: this magic number is not in the buffer - it's for
		 * buffer type information and therefore only the free/data type
		 * matters here, not whether CRCs are enabled or not.
		 */
		state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
	} else {
		state->extravalid = 0;
	}
	/*
	 * Return the index, that will be the insertion point.
	 */
	*indexp = index;
	return -ENOENT;
}

/*
 * Look up a leaf entry in a node-format leaf block.
 * The extrablk in state is a data block.
 *
 * Walks the leaf entries whose hash equals args->hashval and compares
 * the name in each referenced data entry with the one in args.
 *
 * Returns -EEXIST as soon as an exact match is found.  Otherwise returns
 * -ENOENT; a non-exact (e.g. case-insensitive) match found on the way is
 * recorded in args->cmpresult, args->inumber, args->filetype and
 * state->extrablk.  *indexp is set to the index of the recorded match or,
 * on the -ENOENT exit, to where the walk stopped.  A failed data block
 * read returns that error.
 */
STATIC int
xfs_dir2_leafn_lookup_for_entry(
	struct xfs_buf		*bp,		/* leaf buffer */
	xfs_da_args_t		*args,		/* operation arguments */
	int			*indexp,	/* out: leaf entry index */
	xfs_da_state_t		*state)		/* state to fill in */
{
	struct xfs_buf		*curbp = NULL;	/* current data/free buffer */
	xfs_dir2_db_t		curdb = -1;	/* current data block number */
	xfs_dir2_data_entry_t	*dep;		/* data block entry */
	xfs_inode_t		*dp;		/* incore directory inode */
	int			error;		/* error return value */
	int			index;		/* leaf entry index */
	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
	xfs_mount_t		*mp;		/* filesystem mount point */
	xfs_dir2_db_t		newdb;		/* new data block number */
	xfs_trans_t		*tp;		/* transaction pointer */
	enum xfs_dacmp		cmp;		/* comparison result */
	struct xfs_dir2_leaf_entry *ents;
	struct xfs_dir3_icleaf_hdr leafhdr;

	dp = args->dp;
	tp = args->trans;
	mp = dp->i_mount;
	leaf = bp->b_addr;
	dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
	ents = dp->d_ops->leaf_ents_p(leaf);

	xfs_dir3_leaf_check(dp, bp);
	ASSERT(leafhdr.count > 0);

	/*
	 * Look up the hash value in the leaf entries.
	 */
	index = xfs_dir2_leaf_search_hash(args, bp);
	/*
	 * Do we have a buffer coming in?
	 */
	if (state->extravalid) {
		curbp = state->extrablk.bp;
		curdb = state->extrablk.blkno;
	}
	/*
	 * Loop over leaf entries with the right hash value.
	 */
	for (lep = &ents[index];
	     index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval;
	     lep++, index++) {
		/*
		 * Skip stale leaf entries.
		 */
		if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
			continue;
		/*
		 * Pull the data block number from the entry.
		 */
		newdb = xfs_dir2_dataptr_to_db(args->geo,
					       be32_to_cpu(lep->address));
		/*
		 * Not adding a new entry, so we really want to find
		 * the name given to us.
		 *
		 * If it's a different data block, go get it.
		 */
		if (newdb != curdb) {
			/*
			 * If we had a block before that we aren't saving
			 * for a CI name, drop it
			 */
			if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
						curdb != state->extrablk.blkno))
				xfs_trans_brelse(tp, curbp);
			/*
			 * If needing the block that is saved with a CI match,
			 * use it otherwise read in the new data block.
			 */
			if (args->cmpresult != XFS_CMP_DIFFERENT &&
					newdb == state->extrablk.blkno) {
				ASSERT(state->extravalid);
				curbp = state->extrablk.bp;
			} else {
				error = xfs_dir3_data_read(tp, dp,
						xfs_dir2_db_to_da(args->geo,
								  newdb),
						-1, &curbp);
				if (error)
					return error;
			}
			xfs_dir3_data_check(dp, curbp);
			curdb = newdb;
		}
		/*
		 * Point to the data entry.
		 */
		dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
			xfs_dir2_dataptr_to_off(args->geo,
						be32_to_cpu(lep->address)));
		/*
		 * Compare the entry and if it's an exact match, return
		 * EEXIST immediately. If it's the first case-insensitive
		 * match, store the block & inode number and continue looking.
		 */
		cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
		/*
		 * Only act on a match whose result differs from the one
		 * already recorded in args->cmpresult.
		 */
		if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
			/* If there is a CI match block, drop it */
			if (args->cmpresult != XFS_CMP_DIFFERENT &&
						curdb != state->extrablk.blkno)
				xfs_trans_brelse(tp, state->extrablk.bp);
			args->cmpresult = cmp;
			args->inumber = be64_to_cpu(dep->inumber);
			args->filetype = dp->d_ops->data_get_ftype(dep);
			*indexp = index;
			state->extravalid = 1;
			state->extrablk.bp = curbp;
			state->extrablk.blkno = curdb;
			/* extrablk.index is the byte offset of the entry. */
			state->extrablk.index = (int)((char *)dep -
							(char *)curbp->b_addr);
			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
			curbp->b_ops = &xfs_dir3_data_buf_ops;
			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
			if (cmp == XFS_CMP_EXACT)
				return -EEXIST;
		}
	}
	ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
	if (curbp) {
		if (args->cmpresult == XFS_CMP_DIFFERENT) {
			/* Giving back last used data block. */
			state->extravalid = 1;
			state->extrablk.bp = curbp;
			state->extrablk.index = -1;
			state->extrablk.blkno = curdb;
			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
			curbp->b_ops = &xfs_dir3_data_buf_ops;
			xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
		} else {
			/* If the curbp is not the CI match block, drop it */
			if (state->extrablk.bp != curbp)
				xfs_trans_brelse(tp, curbp);
		}
	} else {
		state->extravalid = 0;
	}
	*indexp = index;
	return -ENOENT;
}

/*
 * Look up a leaf entry in a node-format leaf block.
 * If this is an addname then the extrablk in state is a freespace block,
 * otherwise it's a data block.
+ */ +int +xfs_dir2_leafn_lookup_int( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* + * Move count leaf entries from source to destination leaf. + * Log entries and headers. Stale entries are preserved. + */ +static void +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ +{ + int stale; /* count stale leaves copied */ + + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); + + /* + * Silently return if nothing to do. + */ + if (count == 0) + return; + + /* + * If the destination index is not the end of the current + * destination leaf entries, open up a hole in the destination + * to hold the new entries. + */ + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + count + dhdr->count - 1); + } + /* + * If the source has stale leaves, count the ones in the copy range + * so we can update the header correctly. + */ + if (shdr->stale) { + int i; /* temp leaf index */ + + for (i = start_s, stale = 0; i < start_s + count; i++) { + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + } else + stale = 0; + /* + * Copy the leaf entries from source to destination. 
+ */ + memcpy(&dents[start_d], &sents[start_s], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + + /* + * If there are source entries after the ones we copied, + * delete the ones we copied by sliding the next ones down. + */ + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], + count * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); + } + + /* + * Update the headers and log them. + */ + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; +} + +/* + * Determine the sort order of two leaf blocks. + * Returns 1 if both are valid and leaf2 should be before leaf1, else 0. + */ +int /* sort order */ +xfs_dir2_leafn_order( + struct xfs_inode *dp, + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ +{ + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) + return 1; + return 0; +} + +/* + * Rebalance leaf entries between two leaf blocks. + * This is actually only called when the second block is new, + * though the code deals with the general case. + * A new entry will be inserted in one of the blocks, and that + * entry is taken into account when balancing. 
+ */ +static void +xfs_dir2_leafn_rebalance( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *blk1, /* first btree block */ + xfs_da_state_blk_t *blk2) /* second btree block */ +{ + xfs_da_args_t *args; /* operation arguments */ + int count; /* count (& direction) leaves */ + int isleft; /* new goes in left leaf */ + xfs_dir2_leaf_t *leaf1; /* first leaf structure */ + xfs_dir2_leaf_t *leaf2; /* second leaf structure */ + int mid; /* midpoint leaf index */ +#if defined(DEBUG) || defined(XFS_WARN) + int oldstale; /* old count of stale leaves */ +#endif + int oldsum; /* old total leaf count */ + int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + /* + * If the block order is wrong, swap the arguments. + */ + if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) { + xfs_da_state_blk_t *tmp; /* temp for block swap */ + + tmp = blk1; + blk1 = blk2; + blk2 = tmp; + } + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; +#if defined(DEBUG) || defined(XFS_WARN) + oldstale = hdr1.stale + hdr2.stale; +#endif + mid = oldsum >> 1; + + /* + * If the old leaf count was odd then the new one will be even, + * so we need to divide the new count evenly. + */ + if (oldsum & 1) { + xfs_dahash_t midhash; /* middle entry hash value */ + + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); + else + midhash = be32_to_cpu(ents1[mid].hashval); + isleft = args->hashval <= midhash; + } + /* + * If the old count is even then the new count is odd, so there's + * no preferred side for the new entry. + * Pick the left one. 
+ */ + else + isleft = 1; + /* + * Calculate moved entry count. Positive means left-to-right, + * negative means right-to-left. Then move the entries. + */ + count = hdr1.count - mid + (isleft == 0); + if (count > 0) + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); + else if (count < 0) + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); + dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); + + xfs_dir3_leaf_check(dp, blk1->bp); + xfs_dir3_leaf_check(dp, blk2->bp); + + /* + * Mark whether we're inserting into the old or new leaf. + */ + if (hdr1.count < hdr2.count) + state->inleaf = swap; + else if (hdr1.count > hdr2.count) + state->inleaf = !swap; + else + state->inleaf = swap ^ (blk1->index <= hdr1.count); + /* + * Adjust the expected index for insertion. + */ + if (!state->inleaf) + blk2->index = blk1->index - hdr1.count; + + /* + * Finally sanity check just to make sure we are not returning a + * negative index + */ + if (blk2->index < 0) { + state->inleaf = 1; + blk2->index = 0; + xfs_alert(dp->i_mount, + "%s: picked the wrong leaf? 
reverting original leaf: blk1->index %d", + __func__, blk1->index); + } +} + +static int +xfs_dir3_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_inode *dp = args->dp; + + dp->d_ops->free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + if (hdr) { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; + } + + /* One less used entry in the free table. */ + freehdr.nused--; + + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + freehdr.nvalid = i + 1; + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(args, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != -ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + + /* Log the free entry that changed, unless we got rid of it. 
*/ + if (logfree) + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; +} + +/* + * Remove an entry from a node directory. + * This removes the leaf entry and the data entry, + * and updates the free block if necessary. + */ +static int /* error */ +xfs_dir2_leafn_remove( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp, /* leaf buffer */ + int index, /* leaf entry index */ + xfs_da_state_blk_t *dblk, /* data block */ + int *rval) /* resulting block needs join */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t db; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int longest; /* longest data free entry */ + int off; /* data block entry offset */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_remove(args, index); + + dp = args->dp; + tp = args->trans; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + /* + * Point to the entry we're removing. + */ + lep = &ents[index]; + + /* + * Extract the data block and offset from the entry. + */ + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + ASSERT(dblk->blkno == db); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); + ASSERT(dblk->index == off); + + /* + * Kill the leaf entry by marking it stale. + * Log the leaf block changes. 
+ */ + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); + xfs_dir3_leaf_log_ents(args, bp, index, index); + + /* + * Make the data entry free. Keep track of the longest freespace + * in the data block in case it changes. + */ + dbp = dblk->bp; + hdr = dbp->b_addr; + dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); + bf = dp->d_ops->data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); + needlog = needscan = 0; + xfs_dir2_data_make_free(args, dbp, off, + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); + /* + * Rescan the data block freespaces for bestfree. + * Log the data block header if needed. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + if (needlog) + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_data_check(dp, dbp); + /* + * If the longest data block freespace changes, need to update + * the corresponding freeblock entry. + */ + if (longest < be16_to_cpu(bf[0].length)) { + int error; /* error return value */ + struct xfs_buf *fbp; /* freeblock buffer */ + xfs_dir2_db_t fdb; /* freeblock block number */ + int findex; /* index in freeblock entries */ + xfs_dir2_free_t *free; /* freeblock structure */ + + /* + * Convert the data block number to a free block, + * read in the free block. + */ + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), + &fbp); + if (error) + return error; + free = fbp->b_addr; +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); + } +#endif + /* + * Calculate which entry we need to fix. 
+ */ + findex = dp->d_ops->db_to_fdindex(args->geo, db); + longest = be16_to_cpu(bf[0].length); + /* + * If the data block is now empty we can get rid of it + * (usually). + */ + if (longest == args->geo->blksize - + dp->d_ops->data_entry_offset) { + /* + * Try to punch out the data block. + */ + error = xfs_dir2_shrink_inode(args, db, dbp); + if (error == 0) { + dblk->bp = NULL; + hdr = NULL; + } + /* + * We can get ENOSPC if there's no space reservation. + * In this case just drop the buffer and some one else + * will eventually get rid of the empty block. + */ + else if (!(error == -ENOSPC && args->total == 0)) + return error; + } + /* + * If we got rid of the data block, we can eliminate that entry + * in the free block. + */ + error = xfs_dir3_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; + } + + xfs_dir3_leaf_check(dp, bp); + /* + * Return indication of whether this leaf block is empty enough + * to justify trying to join it with a neighbor. + */ + *rval = (dp->d_ops->leaf_hdr_size + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < + args->geo->magicpct; + return 0; +} + +/* + * Split the leaf entries in the old block into old and new blocks. + */ +int /* error */ +xfs_dir2_leafn_split( + xfs_da_state_t *state, /* btree cursor */ + xfs_da_state_blk_t *oldblk, /* original block */ + xfs_da_state_blk_t *newblk) /* newly created block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dablk_t blkno; /* new leaf block number */ + int error; /* error return value */ + struct xfs_inode *dp; + + /* + * Allocate space for a new leaf node. + */ + args = state->args; + dp = args->dp; + ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); + error = xfs_da_grow_inode(args, &blkno); + if (error) { + return error; + } + /* + * Initialize the new leaf block. 
+ */ + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) + return error; + + newblk->blkno = blkno; + newblk->magic = XFS_DIR2_LEAFN_MAGIC; + /* + * Rebalance the entries across the two leaves, link the new + * block into the leaves. + */ + xfs_dir2_leafn_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); + if (error) { + return error; + } + /* + * Insert the new entry in the correct block: state->inleaf selects + * the old block, otherwise the entry goes into the new block. + */ + if (state->inleaf) + error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index); + else + error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index); + /* + * Update last hashval in each block since we added the name. + * This is done whether or not the add above succeeded; the result + * of the add is what gets returned to the caller. + */ + oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + xfs_dir3_leaf_check(dp, oldblk->bp); + xfs_dir3_leaf_check(dp, newblk->bp); + return error; +} + +/* + * Check a leaf block and its neighbors to see if the block should be + * collapsed into one or the other neighbor. Always keep the block + * with the smaller block number. + * + * The return value is 0 or an error code; the verdict is stored in *action + * (it is not written when an error is returned): + * - If the current block is over 50% full, don't try to join it, + * *action = 0. + * - If the block is empty, fill in the state structure, *action = 2 + * (or 0 if the xfs_da3_path_shift() result is zero). + * - If it can be collapsed, fill in the state structure, *action = 1 + * (or 0 if the xfs_da3_path_shift() result is non-zero). + * - If nothing can be done, *action = 0.
+ */ +int /* error */ +xfs_dir2_leafn_toosmall( + xfs_da_state_t *state, /* btree cursor */ + int *action) /* resulting action to take */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dablk_t blkno; /* leaf block number */ + struct xfs_buf *bp; /* leaf buffer */ + int bytes; /* bytes in use */ + int count; /* leaf live entry count */ + int error; /* error return value */ + int forward; /* sibling block direction */ + int i; /* sibling counter */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = state->args->dp; + + /* + * Check for the degenerate case of the block being over 50% full. + * If so, it's not worth even looking to see if we might be able + * to coalesce with a sibling. + */ + blk = &state->path.blk[state->path.active - 1]; + leaf = blk->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir3_leaf_check(dp, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + if (bytes > (state->args->geo->blksize >> 1)) { + /* + * Blk over 50%, don't try to join. + */ + *action = 0; + return 0; + } + /* + * Check for the degenerate case of the block being empty. + * If the block is empty, we'll simply delete it, no need to + * coalesce it with a sibling block. We choose (arbitrarily) + * to merge with the forward block unless it is NULL. + */ + if (count == 0) { + /* + * Make altpath point to the block we want to keep and + * path point to the block we want to drop (this one). + */ + forward = (leafhdr.forw != 0); + memcpy(&state->altpath, &state->path, sizeof(state->path)); + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, + &rval); + if (error) + return error; + *action = rval ? 
2 : 0; + return 0; + } + /* + * Examine each sibling block to see if we can coalesce with + * at least 25% free space to spare. We need to figure out + * whether to merge with the forward or the backward block. + * We prefer coalescing with the lower numbered sibling so as + * to shrink a directory over time. + */ + forward = leafhdr.forw < leafhdr.back; + for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; + if (blkno == 0) + continue; + /* + * Read the sibling leaf block. + */ + error = xfs_dir3_leafn_read(state->args->trans, dp, + blkno, -1, &bp); + if (error) + return error; + + /* + * Count bytes in the two blocks combined. + */ + count = leafhdr.count - leafhdr.stale; + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); + + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + + /* + * Fits with at least 25% to spare. + */ + if (bytes >= 0) + break; + xfs_trans_brelse(state->args->trans, bp); + } + /* + * Didn't like either block, give up. + */ + if (i >= 2) { + *action = 0; + return 0; + } + + /* + * Make altpath point to the block we want to keep (the lower + * numbered block) and path point to the block we want to drop. + */ + memcpy(&state->altpath, &state->path, sizeof(state->path)); + if (blkno < blk->blkno) + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, + &rval); + else + error = xfs_da3_path_shift(state, &state->path, forward, 0, + &rval); + if (error) { + return error; + } + *action = rval ? 0 : 1; + return 0; +} + +/* + * Move all the leaf entries from drop_blk to save_blk. + * This is done as part of a join operation. 
+ */ +void +xfs_dir2_leafn_unbalance( + xfs_da_state_t *state, /* cursor */ + xfs_da_state_blk_t *drop_blk, /* dead block */ + xfs_da_state_blk_t *save_blk) /* surviving block */ +{ + xfs_da_args_t *args; /* operation arguments */ + xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ + xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; + struct xfs_inode *dp = state->args->dp; + + args = state->args; + ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; + + dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); + dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = dp->d_ops->leaf_ents_p(save_leaf); + dents = dp->d_ops->leaf_ents_p(drop_leaf); + + /* + * If there are any stale leaf entries, take this opportunity + * to purge them. + */ + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + + /* + * Move the entries from drop to the appropriate end of save. 
+ */ + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); + if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp)) + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); + else + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); + dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); + + xfs_dir3_leaf_check(dp, save_blk->bp); + xfs_dir3_leaf_check(dp, drop_blk->bp); +} + +/* + * Top-level node form directory addname routine. + */ +int /* error */ +xfs_dir2_node_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block for insert */ + int error; /* error return value */ + int rval; /* sub-return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_addname(args); + + /* + * Allocate and initialize the state (btree cursor). + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Look up the name. We're not supposed to find it, but + * this gives us the insertion point. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + if (rval != -ENOENT) { + goto done; + } + /* + * Add the data entry to a data block. + * Extravalid is set to a freeblock found by lookup. + */ + rval = xfs_dir2_node_addname_int(args, + state->extravalid ? &state->extrablk : NULL); + if (rval) { + goto done; + } + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + /* + * Add the new leaf entry. 
+ */ + rval = xfs_dir2_leafn_add(blk->bp, args, blk->index); + if (rval == 0) { + /* + * It worked, fix the hash values up the btree. + */ + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); + } else { + /* + * It didn't work, we need to split the leaf block. + */ + if (args->total == 0) { + ASSERT(rval == -ENOSPC); + goto done; + } + /* + * Split the leaf block and insert the new entry. + */ + rval = xfs_da3_split(state); + } +done: + xfs_da_state_free(state); + return rval; +} + +/* + * Add the data entry for a node-format directory name addition. + * The leaf entry is added in xfs_dir2_leafn_add. + * We may enter with a freespace block that the lookup found. + */ +static int /* error */ +xfs_dir2_node_addname_int( + xfs_da_args_t *args, /* operation arguments */ + xfs_da_state_blk_t *fblk) /* optional freespace block */ +{ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_db_t dbno; /* data block number */ + struct xfs_buf *dbp; /* data block buffer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ + int error; /* error return value */ + xfs_dir2_db_t fbno; /* freespace block number */ + struct xfs_buf *fbp; /* freespace buffer */ + int findex; /* freespace entry index */ + xfs_dir2_free_t *free=NULL; /* freespace block structure */ + xfs_dir2_db_t ifbno; /* initial freespace block no */ + xfs_dir2_db_t lastfbno=0; /* highest freespace block no */ + int length; /* length of the new entry */ + int logfree; /* need to log free entry */ + xfs_mount_t *mp; /* filesystem mount point */ + int needlog; /* need to log data header */ + int needscan; /* need to rescan data frees */ + __be16 *tagp; /* data entry tag pointer */ + xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; + + dp = args->dp; + mp = dp->i_mount; + tp = 
args->trans; + length = dp->d_ops->data_entsize(args->namelen); + /* + * If we came in with a freespace block that means that lookup + * found an entry with our hash value. This is the freespace + * block for that data entry. + */ + if (fblk) { + fbp = fblk->bp; + /* + * Remember initial freespace block number. + */ + ifbno = fblk->blkno; + free = fbp->b_addr; + findex = fblk->index; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * This means the free entry showed that the data block had + * space for our entry, so we remembered it. + * Use that data block. + */ + if (findex >= 0) { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ + dbno = -1; + findex = 0; + } + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ + ifbno = dbno = -1; + fbp = NULL; + findex = 0; + } + + /* + * If we don't have a data block yet, we're going to scan the + * freespace blocks looking for one. Figure out what the + * highest freespace block number is. + */ + if (dbno == -1) { + xfs_fileoff_t fo; /* freespace block number */ + + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) + return error; + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); + fbno = ifbno; + } + /* + * While we haven't identified a data block, search the freeblock + * data for a good data block. If we find a null freeblock entry, + * indicating a hole in the data blocks, remember that. + */ + while (dbno == -1) { + /* + * If we don't have a freeblock in hand, get the next one. + */ + if (fbp == NULL) { + /* + * Happens the first time through unless lookup gave + * us a freespace block to start with. 
+ */ + if (++fbno == 0) + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); + /* + * If it's ifbno we already looked at it (it is the + * freespace block the lookup handed us), so skip it. + */ + if (fbno == ifbno) + fbno++; + /* + * If it's off the end (at or past lastfbno) we're done + * scanning. + */ + if (fbno >= lastfbno) + break; + /* + * Read the block. There can be holes in the + * freespace blocks, so this might not succeed. + * This should be really rare, so there's no reason + * to avoid it. A hole comes back as a NULL fbp with + * no error; just move on to the next block. + */ + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) + return error; + if (!fbp) + continue; + free = fbp->b_addr; + findex = 0; + } + /* + * Look at the current free entry. Is it good enough, i.e. not + * NULLDATAOFF and at least length bytes? + * + * The bests initialisation should be where the buffer is read in + * the above branch. But gcc does not realise that bests and the + * freehdr are actually initialised if they are placed there, so + * we have to do it here to avoid warnings. + */ + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; + else { + /* + * Are we done with the freeblock? If so the next + * loop iteration will read the following one. + */ + if (++findex == freehdr.nvalid) { + /* + * Drop the block, and clear the buffer + * pointer in fblk (if we were given one) + * along with our own. + */ + xfs_trans_brelse(tp, fbp); + fbp = NULL; + if (fblk && fblk->bp) + fblk->bp = NULL; + } + } + } + /* + * If we don't have a data block, we need to allocate one and make + * the freespace entries refer to it. + */ + if (unlikely(dbno == -1)) { + /* + * Not allowed to allocate, return failure. + */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return -ENOSPC; + + /* + * Allocate and initialize the new data block. + */ + if (unlikely((error = xfs_dir2_grow_inode(args, + XFS_DIR2_DATA_SPACE, + &dbno)) || + (error = xfs_dir3_data_init(args, dbno, &dbp)))) + return error; + + /* + * If (somehow) we have a freespace block, get rid of it.
+ */ + if (fbp) + xfs_trans_brelse(tp, fbp); + if (fblk && fblk->bp) + fblk->bp = NULL; + + /* + * Get the freespace block corresponding to the data block + * that was just allocated. + */ + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) + return error; + + /* + * If there wasn't a freespace block, the read will + * return a NULL fbp. Allocate and initialize a new one. + */ + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) + return error; + + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { + xfs_alert(mp, + "%s: dir ino %llu needed freesp block %lld for\n" + " data block %lld, got %lld ifbno %llu lastfbno %d", + __func__, (unsigned long long)dp->i_ino, + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), + (long long)dbno, (long long)fbno, + (unsigned long long)ifbno, lastfbno); + if (fblk) { + xfs_alert(mp, + " fblk 0x%p blkno %llu index %d magic 0x%x", + fblk, + (unsigned long long)fblk->blkno, + fblk->index, + fblk->magic); + } else { + xfs_alert(mp, " ... fblk is NULL"); + } + XFS_ERROR_REPORT("xfs_dir2_node_addname_int", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + /* + * Get a buffer for the new block. + */ + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) + return error; + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * Remember the first slot as our empty slot. + */ + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); + } else { + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + } + + /* + * Set the freespace block index from the data block number. 
+ */ + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); + /* + * If it's after the end of the current entries in the + * freespace block, extend that table. + */ + if (findex >= freehdr.nvalid) { + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = findex + 1; + /* + * Tag new entry so nused will go up. + */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + } + /* + * If this entry was for an empty data block + * (this should always be true) then update the header. + */ + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); + } + /* + * Update the real value in the table. + * We haven't allocated the data entry yet so this will + * change again. + */ + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[findex] = bf[0].length; + logfree = 1; + } + /* + * We had a data block so we don't have to make a new one. + */ + else { + /* + * If just checking, we succeeded. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + + /* + * Read the data block in. + */ + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), + -1, &dbp); + if (error) + return error; + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + logfree = 0; + } + ASSERT(be16_to_cpu(bf[0].length) >= length); + /* + * Point to the existing unused space. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + needscan = needlog = 0; + /* + * Mark the first part of the unused space, inuse for us. + */ + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, + &needlog, &needscan); + /* + * Fill in the new entry and log it. 
+ */ + dep = (xfs_dir2_data_entry_t *)dup; + dep->inumber = cpu_to_be64(args->inumber); + dep->namelen = args->namelen; + memcpy(dep->name, args->name, dep->namelen); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, dbp, dep); + /* + * Rescan the block for bestfree if needed. + */ + if (needscan) + xfs_dir2_data_freescan(dp, hdr, &needlog); + /* + * Log the data block header if needed. + */ + if (needlog) + xfs_dir2_data_log_header(args, dbp); + /* + * If the freespace entry is now wrong, update it. + */ + bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; + logfree = 1; + } + /* + * Log the freespace entry if needed. + */ + if (logfree) + xfs_dir2_free_log_bests(args, fbp, findex, findex); + /* + * Return the data block and offset in args, then drop the data block. + */ + args->blkno = (xfs_dablk_t)dbno; + args->index = be16_to_cpu(*tagp); + return 0; +} + +/* + * Lookup an entry in a node-format directory. + * All the real work happens in xfs_da3_node_lookup_int. + * The only real output is the inode number of the entry. + */ +int /* error */ +xfs_dir2_node_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + int error; /* error return value */ + int i; /* btree level */ + int rval; /* operation return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_lookup(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + /* + * Fill in the path to the entry in the cursor. 
+ */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + rval = error; + else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return -EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } + /* + * Release the btree blocks and leaf block. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + /* + * Release the data block if we have it. + */ + if (state->extravalid && state->extrablk.bp) { + xfs_trans_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Remove an entry from a node-format directory. + */ +int /* error */ +xfs_dir2_node_removename( + struct xfs_da_args *args) /* operation arguments */ +{ + struct xfs_da_state_blk *blk; /* leaf block */ + int error; /* error return value */ + int rval; /* operation return value */ + struct xfs_da_state *state; /* btree cursor */ + + trace_xfs_dir2_node_removename(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + + /* Look up the entry we're deleting, set up the cursor. */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + goto out_free; + + /* Didn't find it, upper layer screwed up. */ + if (rval != -EEXIST) { + error = rval; + goto out_free; + } + + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(state->extravalid); + /* + * Remove the leaf and data entries. + * Extrablk refers to the data block. + */ + error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, + &state->extrablk, &rval); + if (error) + goto out_free; + /* + * Fix the hash values up the btree. 
+ */ + xfs_da3_fixhashpath(state, &state->path); + /* + * If we need to join leaf blocks, do it. + */ + if (rval && state->path.active > 1) + error = xfs_da3_join(state); + /* + * If no errors so far, try conversion to leaf format. + */ + if (!error) + error = xfs_dir2_node_to_leaf(state); +out_free: + xfs_da_state_free(state); + return error; +} + +/* + * Replace an entry's inode number in a node-format directory. + */ +int /* error */ +xfs_dir2_node_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_da_state_blk_t *blk; /* leaf block */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry changed */ + int error; /* error return value */ + int i; /* btree level */ + xfs_ino_t inum; /* new inode number */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */ + int rval; /* internal return value */ + xfs_da_state_t *state; /* btree cursor */ + + trace_xfs_dir2_node_replace(args); + + /* + * Allocate and initialize the btree cursor. + */ + state = xfs_da_state_alloc(); + state->args = args; + state->mp = args->dp->i_mount; + inum = args->inumber; + /* + * Lookup the entry to change in the btree. + */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) { + rval = error; + } + /* + * It should be found, since the vnodeops layer has looked it up + * and locked it. But paranoia is good. + */ + if (rval == -EEXIST) { + struct xfs_dir2_leaf_entry *ents; + /* + * Find the leaf entry. + */ + blk = &state->path.blk[state->path.active - 1]; + ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); + leaf = blk->bp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + lep = &ents[blk->index]; + ASSERT(state->extravalid); + /* + * Point to the data entry. 
+ */ + hdr = state->extrablk.bp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + dep = (xfs_dir2_data_entry_t *) + ((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + ASSERT(inum != be64_to_cpu(dep->inumber)); + /* + * Fill in the new inode number and log the entry. + */ + dep->inumber = cpu_to_be64(inum); + args->dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); + rval = 0; + } + /* + * Didn't find it, and we're holding a data block. Drop it. + */ + else if (state->extravalid) { + xfs_trans_brelse(args->trans, state->extrablk.bp); + state->extrablk.bp = NULL; + } + /* + * Release all the buffers in the cursor. + */ + for (i = 0; i < state->path.active; i++) { + xfs_trans_brelse(args->trans, state->path.blk[i].bp); + state->path.blk[i].bp = NULL; + } + xfs_da_state_free(state); + return rval; +} + +/* + * Trim off a trailing empty freespace block. + * Return (in rvalp) 1 if we did it, 0 if not. + */ +int /* error */ +xfs_dir2_node_trim_free( + xfs_da_args_t *args, /* operation arguments */ + xfs_fileoff_t fo, /* free block number */ + int *rvalp) /* out: did something */ +{ + struct xfs_buf *bp; /* freespace buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return code */ + xfs_dir2_free_t *free; /* freespace structure */ + xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; + + dp = args->dp; + tp = args->trans; + /* + * Read the freespace block. + */ + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + if (error) + return error; + /* + * There can be holes in freespace. If fo is a hole, there's + * nothing to do. + */ + if (!bp) + return 0; + free = bp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + /* + * If there are used entries, there's nothing to do. 
+ */ + if (freehdr.nused > 0) { + xfs_trans_brelse(tp, bp); + *rvalp = 0; + return 0; + } + /* + * Blow the block away. + */ + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { + /* + * Can't fail with ENOSPC since that only happens with no + * space reservation, when breaking up an extent into two + * pieces. This is the last block of an extent. + */ + ASSERT(error != -ENOSPC); + xfs_trans_brelse(tp, bp); + return error; + } + /* + * Return that we succeeded. + */ + *rvalp = 1; + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_priv.h b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h new file mode 100644 index 000000000..ef9f6ead9 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_priv.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_PRIV_H__ +#define __XFS_DIR2_PRIV_H__ + +struct dir_context; + +/* xfs_dir2.c */ +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + + +/* xfs_dir2_block.c */ +extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf **bpp); +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_buf *lbp, struct xfs_buf *dbp); + +/* xfs_dir2_data.c */ +#ifdef DEBUG +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); +#else +#define xfs_dir3_data_check(dp,bp) +#endif + +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); + +extern struct xfs_dir2_data_free * +xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup, + int *loghead); +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_buf **bpp); + +/* xfs_dir2_leaf.c */ +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); 
+extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_buf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, + int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_buf *lbp, xfs_dir2_db_t db); +extern struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + +/* xfs_dir2_node.c */ +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, + struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf 
*leaf1_bp, + struct xfs_buf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); + +/* xfs_dir2_sf.c */ +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +/* xfs_dir2_readdir.c */ +extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); + +#endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_dir2_sf.c b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c new file mode 100644 index 000000000..974d62e67 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dir2_sf.c @@ -0,0 +1,1142 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" + +/* + * Prototypes for internal functions. + */ +static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args, + xfs_dir2_sf_entry_t *sfep, + xfs_dir2_data_aoff_t offset, + int new_isize); +static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange, + int new_isize); +static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange, + xfs_dir2_sf_entry_t **sfepp, + xfs_dir2_data_aoff_t *offsetp); +#ifdef DEBUG +static void xfs_dir2_sf_check(xfs_da_args_t *args); +#else +#define xfs_dir2_sf_check(args) +#endif /* DEBUG */ + +static void xfs_dir2_sf_toino4(xfs_da_args_t *args); +static void xfs_dir2_sf_toino8(xfs_da_args_t *args); + +/* + * Given a block directory (dp/block), calculate its size as a shortform (sf) + * directory and a header for the sf directory, if it will fit it the + * space currently present in the inode. If it won't fit, the output + * size is too big (but not accurate). 
+ */ +int /* size for sf form */ +xfs_dir2_block_sfsize( + xfs_inode_t *dp, /* incore inode pointer */ + xfs_dir2_data_hdr_t *hdr, /* block directory data */ + xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */ +{ + xfs_dir2_dataptr_t addr; /* data entry address */ + xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */ + xfs_dir2_block_tail_t *btp; /* tail area of the block */ + int count; /* shortform entry count */ + xfs_dir2_data_entry_t *dep; /* data entry in the block */ + int i; /* block entry index */ + int i8count; /* count of big-inode entries */ + int isdot; /* entry is "." */ + int isdotdot; /* entry is ".." */ + xfs_mount_t *mp; /* mount structure pointer */ + int namelen; /* total name bytes */ + xfs_ino_t parent = 0; /* parent inode number */ + int size=0; /* total computed size */ + int has_ftype; + struct xfs_da_geometry *geo; + + mp = dp->i_mount; + geo = mp->m_dir_geo; + + /* + * if there is a filetype field, add the extra byte to the namelen + * for each entry that we see. + */ + has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; + + count = i8count = namelen = 0; + btp = xfs_dir2_block_tail_p(geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + + /* + * Iterate over the block's data entries by using the leaf pointers. + */ + for (i = 0; i < be32_to_cpu(btp->count); i++) { + if ((addr = be32_to_cpu(blp[i].address)) == XFS_DIR2_NULL_DATAPTR) + continue; + /* + * Calculate the pointer to the entry at hand. + */ + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); + /* + * Detect . and .., so we can special-case them. + * . is not included in sf directories. + * .. is included by just the parent inode number. + */ + isdot = dep->namelen == 1 && dep->name[0] == '.'; + isdotdot = + dep->namelen == 2 && + dep->name[0] == '.' 
&& dep->name[1] == '.'; + + if (!isdot) + i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; + + /* take into account the file type field */ + if (!isdot && !isdotdot) { + count++; + namelen += dep->namelen + has_ftype; + } else if (isdotdot) + parent = be64_to_cpu(dep->inumber); + /* + * Calculate the new size, see if we should give up yet. + */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count + /* namelen */ + count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ + namelen + /* name */ + (i8count ? /* inumber */ + (uint)sizeof(xfs_dir2_ino8_t) * count : + (uint)sizeof(xfs_dir2_ino4_t) * count); + if (size > XFS_IFORK_DSIZE(dp)) + return size; /* size value is a failure */ + } + /* + * Create the output header, if it worked. + */ + sfhp->count = count; + sfhp->i8count = i8count; + dp->d_ops->sf_put_parent_ino(sfhp, parent); + return size; +} + +/* + * Convert a block format directory to shortform. + * Caller has already checked that it will fit, and built us a header. 
+ */ +int /* error */ +xfs_dir2_block_to_sf( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp, + int size, /* shortform directory size */ + xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ +{ + xfs_dir2_data_hdr_t *hdr; /* block header */ + xfs_dir2_block_tail_t *btp; /* block tail pointer */ + xfs_dir2_data_entry_t *dep; /* data entry pointer */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_data_unused_t *dup; /* unused data pointer */ + char *endptr; /* end of data entries */ + int error; /* error return value */ + int logflags; /* inode logging flags */ + xfs_mount_t *mp; /* filesystem mount point */ + char *ptr; /* current data pointer */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ + + trace_xfs_dir2_block_to_sf(args); + + dp = args->dp; + mp = dp->i_mount; + + /* + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. + */ + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; + + /* + * Copy the header into the newly allocate local space. + */ + sfp = (xfs_dir2_sf_hdr_t *)dst; + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); + + /* + * Set up to loop over the block's entries. + */ + btp = xfs_dir2_block_tail_p(args->geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); + /* + * Loop over the active and unused entries. + * Stop when we reach the leaf/tail portion of the block. + */ + while (ptr < endptr) { + /* + * If it's unused, just skip over it. 
+ */ + dup = (xfs_dir2_data_unused_t *)ptr; + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + /* + * Skip . + */ + if (dep->namelen == 1 && dep->name[0] == '.') + ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino); + /* + * Skip .., but make sure the inode number is right. + */ + else if (dep->namelen == 2 && + dep->name[0] == '.' && dep->name[1] == '.') + ASSERT(be64_to_cpu(dep->inumber) == + dp->d_ops->sf_get_parent_ino(sfp)); + /* + * Normal entry, copy it into shortform. + */ + else { + sfep->namelen = dep->namelen; + xfs_dir2_sf_put_offset(sfep, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); + memcpy(sfep->name, dep->name, dep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ftype(sfep, + dp->d_ops->data_get_ftype(dep)); + + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + ptr += dp->d_ops->data_entsize(dep->namelen); + } + ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); + if (error) { + ASSERT(error != -ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); +out: + xfs_trans_log_inode(args->trans, dp, logflags); + kmem_free(dst); + return error; +} + +/* + * Add a name to a shortform directory. + * There are two algorithms, "easy" and "hard" which we decide on + * before changing anything. 
+ * Convert to block form if necessary, if the new entry won't fit. + */ +int /* error */ +xfs_dir2_sf_addname( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int incr_isize; /* total change in size */ + int new_isize; /* di_size after adding name */ + int objchange; /* changing to 8-byte inodes */ + xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ + int pick; /* which algorithm to use */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ + + trace_xfs_dir2_sf_addname(args); + + ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); + dp = args->dp; + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Make sure the shortform value has some of its header. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Compute entry (and change in) size. + */ + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); + objchange = 0; + + /* + * Do we have to change to 8 byte inodes? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { + /* + * Yes, adjust the inode size. old count + (parent + new) + */ + incr_isize += + (sfp->count + 2) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + objchange = 1; + } + + new_isize = (int)dp->i_d.di_size + incr_isize; + /* + * Won't fit as shortform any more (due to size), + * or the pick routine says it won't (due to offset values). + */ + if (new_isize > XFS_IFORK_DSIZE(dp) || + (pick = + xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { + /* + * Just checking or no space reservation, it doesn't fit. 
+ */ + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) + return -ENOSPC; + /* + * Convert to block form then add the name. + */ + error = xfs_dir2_sf_to_block(args); + if (error) + return error; + return xfs_dir2_block_addname(args); + } + /* + * Just checking, it fits. + */ + if (args->op_flags & XFS_DA_OP_JUSTCHECK) + return 0; + /* + * Do it the easy way - just add it at the end. + */ + if (pick == 1) + xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize); + /* + * Do it the hard way - look for a place to insert the new entry. + * Convert to 8 byte inode numbers first if necessary. + */ + else { + ASSERT(pick == 2); + if (objchange) + xfs_dir2_sf_toino8(args); + xfs_dir2_sf_addname_hard(args, objchange, new_isize); + } + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Add the new entry the "easy" way. + * This is copying the old directory and adding the new entry at the end. + * Since it's sorted by "offset" we need room after the last offset + * that's already there, and then room to convert to a block directory. + * This is already checked by the pick routine. + */ +static void +xfs_dir2_sf_addname_easy( + xfs_da_args_t *args, /* operation arguments */ + xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */ + xfs_dir2_data_aoff_t offset, /* offset to use for new ent */ + int new_isize) /* new directory size */ +{ + int byteoff; /* byte offset in sf dir */ + xfs_inode_t *dp; /* incore directory inode */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + byteoff = (int)((char *)sfep - (char *)sfp); + /* + * Grow the in-inode space. + */ + xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + XFS_DATA_FORK); + /* + * Need to set up again due to realloc of the inode data. 
+ */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); + /* + * Fill in the new entry. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + + /* + * Update the header and inode. + */ + sfp->count++; + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) + sfp->i8count++; + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Add the new entry the "hard" way. + * The caller has already converted to 8 byte inode numbers if necessary, + * in which case we need to leave the i8count at 1. + * Find a hole that the new entry will fit into, and copy + * the first part of the entries, the new entry, and the last part of + * the entries. + */ +/* ARGSUSED */ +static void +xfs_dir2_sf_addname_hard( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* changing inode number size */ + int new_isize) /* new directory size */ +{ + int add_datasize; /* data size need for new ent */ + char *buf; /* buffer for old */ + xfs_inode_t *dp; /* incore directory inode */ + int eof; /* reached end of old dir */ + int nbytes; /* temp for byte copies */ + xfs_dir2_data_aoff_t new_offset; /* next offset value */ + xfs_dir2_data_aoff_t offset; /* current offset value */ + int old_isize; /* previous di_size */ + xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ + xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ + xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ + + /* + * Copy the old directory to the stack buffer. 
+ */ + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + old_isize = (int)dp->i_d.di_size; + buf = kmem_alloc(old_isize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + memcpy(oldsfp, sfp, old_isize); + /* + * Loop over the old directory finding the place we're going + * to insert the new entry. + * If it's going to end up at the end then oldsfep will point there. + */ + for (offset = dp->d_ops->data_first_offset, + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = dp->d_ops->data_entsize(args->namelen), + eof = (char *)oldsfep == &buf[old_isize]; + !eof; + offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), + eof = (char *)oldsfep == &buf[old_isize]) { + new_offset = xfs_dir2_sf_get_offset(oldsfep); + if (offset + add_datasize <= new_offset) + break; + } + /* + * Get rid of the old directory, then allocate space for + * the new one. We do this so xfs_idata_realloc won't copy + * the data. + */ + xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); + xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* + * Reset the pointer since the buffer was reallocated. + */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Copy the first part of the directory, including the header. + */ + nbytes = (int)((char *)oldsfep - (char *)oldsfp); + memcpy(sfp, oldsfp, nbytes); + sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes); + /* + * Fill in the new entry, and update the header counts. + */ + sfep->namelen = args->namelen; + xfs_dir2_sf_put_offset(sfep, offset); + memcpy(sfep->name, args->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + sfp->count++; + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) + sfp->i8count++; + /* + * If there's more left to copy, do that. 
+ */ + if (!eof) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + memcpy(sfep, oldsfep, old_isize - nbytes); + } + kmem_free(buf); + dp->i_d.di_size = new_isize; + xfs_dir2_sf_check(args); +} + +/* + * Decide if the new entry will fit at all. + * If it will fit, pick between adding the new entry to the end (easy) + * or somewhere else (hard). + * Return 0 (won't fit), 1 (easy), 2 (hard). + */ +/*ARGSUSED*/ +static int /* pick result */ +xfs_dir2_sf_addname_pick( + xfs_da_args_t *args, /* operation arguments */ + int objchange, /* inode # size changes */ + xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */ + xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int holefit; /* found hole it will fit in */ + int i; /* entry number */ + xfs_dir2_data_aoff_t offset; /* data block offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + int size; /* entry's data size */ + int used; /* data bytes used */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + size = dp->d_ops->data_entsize(args->namelen); + offset = dp->d_ops->data_first_offset; + sfep = xfs_dir2_sf_firstentry(sfp); + holefit = 0; + /* + * Loop over sf entries. + * Keep track of data offset and whether we've seen a place + * to insert the new entry. + */ + for (i = 0; i < sfp->count; i++) { + if (!holefit) + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + /* + * Calculate data bytes used excluding the new entry, if this + * was a data block (block form directory). 
+ */ + used = offset + + (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t); + /* + * If it won't fit in a block form then we can't insert it, + * we'll go back, convert to block, then try the insert and convert + * to leaf. + */ + if (used + (holefit ? 0 : size) > args->geo->blksize) + return 0; + /* + * If changing the inode number size, do it the hard way. + */ + if (objchange) + return 2; + /* + * If it won't fit at the end then do it the hard way (use the hole). + */ + if (used + size > args->geo->blksize) + return 2; + /* + * Do it the easy way. + */ + *sfepp = sfep; + *offsetp = offset; + return 1; +} + +#ifdef DEBUG +/* + * Check consistency of shortform directory, assert if bad. + */ +static void +xfs_dir2_sf_check( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry number */ + int i8count; /* number of big inode#s */ + xfs_ino_t ino; /* entry inode number */ + int offset; /* data offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + dp = args->dp; + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + offset = dp->d_ops->data_first_offset; + ino = dp->d_ops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = dp->d_ops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + offset = + xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); + } + ASSERT(i8count == sfp->i8count); + ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); + ASSERT(offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); +} +#endif /* DEBUG */ + +/* 
+ * Create a new (shortform) directory. + */ +int /* error, always 0 */ +xfs_dir2_sf_create( + xfs_da_args_t *args, /* operation arguments */ + xfs_ino_t pino) /* parent inode number */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i8count; /* parent inode is an 8-byte number */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + int size; /* directory size */ + + trace_xfs_dir2_sf_create(args); + + dp = args->dp; + + ASSERT(dp != NULL); + ASSERT(dp->i_d.di_size == 0); + /* + * If it's currently a zero-length extent file, + * convert it to local format. + */ + if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + dp->i_df.if_flags |= XFS_IFINLINE; + } + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ASSERT(dp->i_df.if_bytes == 0); + i8count = pino > XFS_DIR2_MAX_SHORT_INUM; + size = xfs_dir2_sf_hdr_size(i8count); + /* + * Make a buffer for the data. + */ + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + /* + * Fill in the header, + */ + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp->i8count = i8count; + /* + * Now can put in the inode number, since i8count is set. + */ + dp->d_ops->sf_put_parent_ino(sfp, pino); + sfp->count = 0; + dp->i_d.di_size = size; + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Lookup an entry in a shortform directory. + * Returns EEXIST if found, ENOENT if not found. + */ +int /* error */ +xfs_dir2_sf_lookup( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int error; + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. 
entry */ + + trace_xfs_dir2_sf_lookup(args); + + xfs_dir2_sf_check(args); + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Special case for . + */ + if (args->namelen == 1 && args->name[0] == '.') { + args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; + return -EEXIST; + } + /* + * Special case for .. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + args->inumber = dp->d_ops->sf_get_parent_ino(sfp); + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; + return -EEXIST; + } + /* + * Loop over all the entries trying to match ours. + */ + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); + args->filetype = dp->d_ops->sf_get_ftype(sfep); + if (cmp == XFS_CMP_EXACT) + return -EEXIST; + ci_sfep = sfep; + } + } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return -ENOENT. 
+ */ + if (!ci_sfep) + return -ENOENT; + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return error; +} + +/* + * Remove an entry from a shortform directory. + */ +int /* error */ +xfs_dir2_sf_removename( + xfs_da_args_t *args) +{ + int byteoff; /* offset of removed entry */ + xfs_inode_t *dp; /* incore directory inode */ + int entsize; /* this entry's size */ + int i; /* shortform entry index */ + int newsize; /* new inode size */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_removename(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + oldsize = (int)dp->i_d.di_size; + /* + * Bail out if the directory is way too short. + */ + if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == oldsize); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); + /* + * Loop over the old directory entries. + * Find the one we're deleting. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == + args->inumber); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->count) + return -ENOENT; + /* + * Calculate sizes. + */ + byteoff = (int)((char *)sfep - (char *)sfp); + entsize = dp->d_ops->sf_entsize(sfp, args->namelen); + newsize = oldsize - entsize; + /* + * Copy the part if any after the removed entry, sliding it down. 
+ */ + if (byteoff + entsize < oldsize) + memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize, + oldsize - (byteoff + entsize)); + /* + * Fix up the header and file size. + */ + sfp->count--; + dp->i_d.di_size = newsize; + /* + * Reallocate, making it smaller. + */ + xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Are we changing inode number size? + */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + if (sfp->i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->i8count--; + } + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); + return 0; +} + +/* + * Replace the inode number of an entry in a shortform directory. + */ +int /* error */ +xfs_dir2_sf_replace( + xfs_da_args_t *args) /* operation arguments */ +{ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + xfs_ino_t ino=0; /* entry old inode number */ + int i8elevated; /* sf_toino8 set i8count=1 */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_replace(args); + + dp = args->dp; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Bail out if the shortform directory is way too small. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * New inode number is large, and need to convert to 8-byte inodes. 
+ */ + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { + int error; /* error return value */ + int newsize; /* new inode size */ + + newsize = + dp->i_df.if_bytes + + (sfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - + (uint)sizeof(xfs_dir2_ino4_t)); + /* + * Won't fit as shortform, convert to block then do replace. + */ + if (newsize > XFS_IFORK_DSIZE(dp)) { + error = xfs_dir2_sf_to_block(args); + if (error) { + return error; + } + return xfs_dir2_block_replace(args); + } + /* + * Still fits, convert to 8-byte now. + */ + xfs_dir2_sf_toino8(args); + i8elevated = 1; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + } else + i8elevated = 0; + + ASSERT(args->namelen != 1 || args->name[0] != '.'); + /* + * Replace ..'s entry. + */ + if (args->namelen == 2 && + args->name[0] == '.' && args->name[1] == '.') { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ASSERT(args->inumber != ino); + dp->d_ops->sf_put_parent_ino(sfp, args->inumber); + } + /* + * Normal entry, look for the name. + */ + else { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ino = dp->d_ops->sf_get_ino(sfp, sfep); + ASSERT(args->inumber != ino); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + break; + } + } + /* + * Didn't find it. + */ + if (i == sfp->count) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (i8elevated) + xfs_dir2_sf_toino4(args); + return -ENOENT; + } + } + /* + * See if the old number was large, the new number is small. + */ + if (ino > XFS_DIR2_MAX_SHORT_INUM && + args->inumber <= XFS_DIR2_MAX_SHORT_INUM) { + /* + * And the old count was one, so need to convert to small. + */ + if (sfp->i8count == 1) + xfs_dir2_sf_toino4(args); + else + sfp->i8count--; + } + /* + * See if the old number was small, the new number is large. 
+ */ + if (ino <= XFS_DIR2_MAX_SHORT_INUM && + args->inumber > XFS_DIR2_MAX_SHORT_INUM) { + /* + * add to the i8count unless we just converted to 8-byte + * inodes (which does an implied i8count = 1) + */ + ASSERT(sfp->i8count != 0); + if (!i8elevated) + sfp->i8count++; + } + xfs_dir2_sf_check(args); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); + return 0; +} + +/* + * Convert from 8-byte inode numbers to 4-byte inode numbers. + * The last 8-byte inode number is gone, but the count is still 1. + */ +static void +xfs_dir2_sf_toino4( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino4(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. + */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 1); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size. + */ + newsize = + oldsize - + (oldsfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. 
+ */ + sfp->count = oldsfp->count; + sfp->i8count = 0; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} + +/* + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. + */ +static void +xfs_dir2_sf_toino8( + xfs_da_args_t *args) /* operation arguments */ +{ + char *buf; /* old dir's buffer */ + xfs_inode_t *dp; /* incore directory inode */ + int i; /* entry index */ + int newsize; /* new inode size */ + xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ + int oldsize; /* old inode size */ + xfs_dir2_sf_entry_t *sfep; /* new sf entry */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + + trace_xfs_dir2_sf_toino8(args); + + dp = args->dp; + + /* + * Copy the old directory to the buffer. + * Then nuke it from the inode, and add the new buffer to the inode. + * Don't want xfs_idata_realloc copying the data here. 
+ */ + oldsize = dp->i_df.if_bytes; + buf = kmem_alloc(oldsize, KM_SLEEP); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 0); + memcpy(buf, oldsfp, oldsize); + /* + * Compute the new inode size (nb: entry count + 1 for parent) + */ + newsize = + oldsize + + (oldsfp->count + 1) * + ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); + xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); + /* + * Reset our pointers, the data has moved. + */ + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + /* + * Fill in the new header. + */ + sfp->count = oldsfp->count; + sfp->i8count = 1; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); + /* + * Copy the entries field by field. + */ + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { + sfep->namelen = oldsfep->namelen; + sfep->offset = oldsfep->offset; + memcpy(sfep->name, oldsfep->name, sfep->namelen); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); + } + /* + * Clean up the inode. + */ + kmem_free(buf); + dp->i_d.di_size = newsize; + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); +} diff --git a/kernel/fs/xfs/libxfs/xfs_dquot_buf.c b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c new file mode 100644 index 000000000..6fbf2d853 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_dquot_buf.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" + +int +xfs_calc_dquots_per_chunk( + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. 
Just don't complain about bad dquot blks. + */ + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* 
+ * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + return errs; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * If we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. b_length + * is already in basic blocks, so it is passed through unconverted. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool +xfs_dquot_buf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_dqid_t id = 0; + int ndquots; + int i; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + /* + * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_buf_verify"); + if (error) + return false; + } + return true; +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } +} + +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + diff --git a/kernel/fs/xfs/libxfs/xfs_format.h b/kernel/fs/xfs/libxfs/xfs_format.h new file mode 100644 index 000000000..4daaa6623 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_format.h @@ -0,0 +1,1461 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FORMAT_H__ +#define __XFS_FORMAT_H__ + +/* + * XFS On Disk Format Definitions + * + * This header file defines all the on-disk format definitions for + * general XFS objects. Directory and attribute related objects are defined in + * xfs_da_format.h, while log and log item formats are defined in + * xfs_log_format.h. Everything else goes here. + */ + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; + +/* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens.
*/ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. + */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ + +#define XFS_SB_VERSION2_OKBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. 
+ */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_rfsblock_t sb_dblocks; /* number of data blocks */ + xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ + xfs_rtblock_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_fsblock_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). 
+ */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb structure to + * 64 bits. Some machines will be using this field for features2 bits. + * Easiest just to mark it bad and not use it for anything else. + * + * This is not kept up to date in memory; it is always overwritten by + * the value in sb_features2 when formatting the incore superblock to + * the disk buffer. + */ + __uint32_t sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. 
+ */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. 
+ */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +/* + * The first XFS version we support is a v4 superblock with V2 directories. 
+ */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) +{ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; +} + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +{ + return sbp->sb_bad_features2 != sbp->sb_features2; +} + +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); +} + +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; +} + +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; +} + +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); +} + +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == 
XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); +} + +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); +} + +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; +} + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. 
+ * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; +} + +/* + * V5 superblock specific feature checks + */ +static inline int 
xfs_sb_version_hascrc(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); +} + +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + +/* + * end of superblock version macros + */ + +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) + +/* + * File system block to byte conversions. 
+ */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +/* + * Allocation group header + * + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. 
*/ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ +} xfs_agf_t; + +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" 
}, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) + +/* + * Size of the unlinked inode hash table in the agi. + */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. 
+ */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ +} xfs_agi_t; + +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) + +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystems we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC.
+ */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + +/* XFS_MIN_FREELIST*: min(levels + 1, XFS_AG_MAXLEVELS) summed over the bno and cnt btrees */ +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN((bl) + 1, XFS_AG_MAXLEVELS(mp)) + MIN((cl) + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ?
\ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. 
+ */ +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... 
*/ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + +#define DI_MAX_FLUSH 0xffff + +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. + */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. 
+ */ +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return char pointers to the data or attribute fork of on-disk inode dip. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)(dip) + xfs_dinode_size((dip)->di_version)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s.
+ */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | 
XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ +#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1)) /* low k bits set, truncated to 32 bits */ +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + (XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)) /* parenthesized so the sum composes safely */ +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +/* + * RealTime Device format definitions + */ + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64
* 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * RT Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((bp)->b_addr + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) + +/* + * Dquot and dquot block format definitions + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. 
+ */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ +} xfs_dqblk_t; + +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + +/* + * Remote symlink format and access functions. 
+ */ +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + + +/* + * Allocation Btree format definitions + * + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + + +/* + * Inode Allocation Btree format definitions + * + * There is a btree for the inode map per allocation group. 
+ */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) + + + +/* + * BMAP Btree format definitions + * + * This includes both the root block definition that sits inside an inode fork + * and the record/pointer formats for the leaf/node in the blocks. 
+ */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Incore version of above. 
+ */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + + +/* + * Generic Btree block format definitions + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. 
+ */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + +/* + * On-disk XFS access control list structure. + */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; +}; + +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? 
(XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + +/* On-disk XFS xattr names; _SIZE = name length (sizeof the literal, not the pointer cast) */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof("SGI_ACL_FILE")-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof("SGI_ACL_DEFAULT")-1) + +#endif /* __XFS_FORMAT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_fs.h b/kernel/fs/xfs/libxfs/xfs_fs.h new file mode 100644 index 000000000..18dc721ca --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_fs.h @@ -0,0 +1,576 @@ +/* + * Copyright (c) 1995-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FS_H__ +#define __XFS_FS_H__ + +/* + * SGI's XFS filesystem's major stuff (constants, structures) + */ + +/* + * Direct I/O attribute record used with XFS_IOC_DIOINFO + * d_miniosz is the min xfer size, xfer size multiple and file seek offset + * alignment. + */ +#ifndef HAVE_DIOATTR +struct dioattr { + __u32 d_mem; /* data buffer memory alignment */ + __u32 d_miniosz; /* min xfer size */ + __u32 d_maxiosz; /* max xfer size */ +}; +#endif + +/* + * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
+ */ +#ifndef HAVE_FSXATTR +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; +#endif + +/* + * Flags for the bs_xflags/fsx_xflags field + * There should be a one-to-one correspondence between these flags and the + * XFS_DIFLAG_s. + */ +#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + +/* + * Structure for XFS_IOC_GETBMAP. + * On input, fill in bmv_offset and bmv_length of the first structure + * to indicate the area of interest in the file, and bmv_entries with + * the number of array elements given back. The first structure is + * updated on return to give the offset and length for the next call. 
+ */ +#ifndef HAVE_GETBMAP +struct getbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output) */ +}; +#endif + +/* + * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries + * are used exactly as in the getbmap structure. The getbmapx structure + * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field + * is only used for the first structure. It contains input flags + * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled + * in by the XFS_IOC_GETBMAPX command for each returned structure after + * the first. + */ +#ifndef HAVE_GETBMAPX +struct getbmapx { + __s64 bmv_offset; /* file offset of segment in blocks */ + __s64 bmv_block; /* starting block (64-bit daddr_t) */ + __s64 bmv_length; /* length of segment, blocks */ + __s32 bmv_count; /* # of entries in array incl. 1st */ + __s32 bmv_entries; /* # of entries filled in (output). */ + __s32 bmv_iflags; /* input flags (1st structure) */ + __s32 bmv_oflags; /* output flags (after 1st structure)*/ + __s32 bmv_unused1; /* future use */ + __s32 bmv_unused2; /* future use */ +}; +#endif + +/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. 
*/ +#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ +#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ +#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + +/* bmv_oflags values - returned for each non-header segment */ +#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ + +/* + * Structure for XFS_IOC_FSSETDM. + * For use by backup and restore programs to set the XFS on-disk inode + * fields di_dmevmask and di_dmstate. These must be set to exactly and + * only values previously obtained via xfs_bulkstat! (Specifically the + * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + */ +#ifndef HAVE_FSDMIDATA +struct fsdmidata { + __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ + __u16 fsd_padding; + __u16 fsd_dmstate; /* corresponds to di_dmstate */ +}; +#endif + +/* + * File segment locking set data type for 64 bit access. + * Also used for all the RESV/FREE interfaces. 
+ */ +typedef struct xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start; + __s64 l_len; /* len == 0 means until end of file */ + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} xfs_flock64_t; + +/* + * Output for XFS_IOC_FSGEOMETRY_V1 + */ +typedef struct xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} xfs_fsop_geom_v1_t; + +/* + * Output for XFS_IOC_FSGEOMETRY + */ +typedef struct xfs_fsop_geom { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + 
unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +} xfs_fsop_geom_t; + +/* Output for XFS_FS_COUNTS */ +typedef struct xfs_fsop_counts { + __u64 freedata; /* free data section blocks */ + __u64 freertx; /* free rt extents */ + __u64 freeino; /* free inodes */ + __u64 allocino; /* total allocated inodes */ +} xfs_fsop_counts_t; + +/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */ +typedef struct xfs_fsop_resblks { + __u64 resblks; + __u64 resblks_avail; +} xfs_fsop_resblks_t; + +#define XFS_FSOP_GEOM_VERSION 0 + +#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode 
directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ + +/* + * Minimum and maximum sizes needed for growth checks. + * + * Block counts are in units of filesystem blocks, not basic blocks. + */ +#define XFS_MIN_AG_BLOCKS 64 +#define XFS_MIN_LOG_BLOCKS 512ULL +#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) +#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) + +/* keep the maximum size under 2^31 by a small amount */ +#define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) + +/* Used for sanity checks on superblock */ +#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks) +#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \ + (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) + +/* + * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT + */ +typedef struct xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} xfs_growfs_data_t; + +typedef struct xfs_growfs_log { + __u32 newblocks; /* new log size, fsblocks */ + __u32 isint; /* 1 if new log is internal */ +} xfs_growfs_log_t; + +typedef struct xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} xfs_growfs_rt_t; + + +/* + * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE + */ +typedef struct xfs_bstime { + time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} xfs_bstime_t; + +typedef struct xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + xfs_bstime_t bs_atime; /* access time */ + xfs_bstime_t bs_mtime; /* modify time */ + xfs_bstime_t bs_ctime; /* inode change time */ + 
int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} xfs_bstat_t; + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline __uint32_t +bstat_get_projid(struct xfs_bstat *bs) +{ + return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; +} + +/* + * The user-level BulkStat Request interface structure. + */ +typedef struct xfs_fsop_bulkreq { + __u64 __user *lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + void __user *ubuffer;/* user buffer for inode desc. */ + __s32 __user *ocount; /* output count pointer */ +} xfs_fsop_bulkreq_t; + + +/* + * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). + */ +typedef struct xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} xfs_inogrp_t; + + +/* + * Error injection. + */ +typedef struct xfs_error_injection { + __s32 fd; + __s32 errtag; +} xfs_error_injection_t; + + +/* + * Speculative preallocation trimming. 
+ */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_fs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 eof_min_file_size; + __u64 pad64[12]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm; + * kernel only, not included in + * valid mask */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) + + +/* + * The user-level Handle Request interface structure. + */ +typedef struct xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + void __user *path; /* user pathname */ + __u32 oflags; /* open flags */ + void __user *ihandle;/* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + void __user *ohandle;/* user buffer for handle */ + __u32 __user *ohandlen;/* user buffer length */ +} xfs_fsop_handlereq_t; + +/* + * Compound structures for passing args through Handle Request interfaces + * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and + * XFS_IOC_ATTRMULTI_BY_HANDLE + */ + +typedef struct xfs_fsop_setdm_handlereq { + struct xfs_fsop_handlereq hreq; /* handle information */ + struct fsdmidata __user *data; /* DMAPI data */ +} xfs_fsop_setdm_handlereq_t; + +typedef struct xfs_attrlist_cursor { + __u32 opaque[4]; +} xfs_attrlist_cursor_t; + +typedef struct xfs_fsop_attrlist_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + 
__u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + void __user *buffer; /* returned names */ +} xfs_fsop_attrlist_handlereq_t; + +typedef struct xfs_attr_multiop { + __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ + __s32 am_error; + void __user *am_attrname; + void __user *am_attrvalue; + __u32 am_length; + __u32 am_flags; +} xfs_attr_multiop_t; + +typedef struct xfs_fsop_attrmulti_handlereq { + struct xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + struct xfs_attr_multiop __user *ops; /* attr_multi data */ +} xfs_fsop_attrmulti_handlereq_t; + +/* + * per machine unique filesystem identifier types. + */ +typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ + +typedef struct xfs_fid { + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ +} xfs_fid_t; + +typedef struct xfs_handle { + union { + __s64 align; /* force alignment of ha_fid */ + xfs_fsid_t _ha_fsid; /* unique file system identifier */ + } ha_u; + xfs_fid_t ha_fid; /* file system specific file ID */ +} xfs_handle_t; +#define ha_fsid ha_u._ha_fsid + +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ + - (char *) &(handle)) \ + + (handle).ha_fid.fid_len) + +/* + * Structure passed to XFS_IOC_SWAPEXT + */ +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ +#define XFS_SX_VERSION 0 + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; + +/* + * Flags for going down operation 
+ */ +#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* + * ioctl commands that are used by Linux filesystems + */ +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION + +/* + * ioctl commands that replace IRIX fcntl()'s + * For 'documentation' purposes more than anything else, + * the "cmd #" field reflects the IRIX fcntl number. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) +#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) +#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) +#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) +#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) +#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64) +#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap) +#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr) +/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ +/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ +#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) + +/* + * ioctl commands that replace IRIX syssgi()'s + */ +#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1) +#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 
102, struct xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq) +#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq) +#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext) +#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data) +#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log) +#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt) +#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts) +#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks) +#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks) +#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) +#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) +/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ + +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ +#ifndef FIFREEZE +#define XFS_IOC_FREEZE _IOWR('X', 119, int) +#define XFS_IOC_THAW _IOWR('X', 120, int) +#endif + +#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) +#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t) +/* XFS_IOC_GETFSUUID ---------- deprecated 140 */ + + +#ifndef HAVE_BBMACROS +/* + * Block I/O parameterization. A basic block (BB) is the lowest size of + * filesystem allocation, and must equal 512. Length units given to bio + * routines are in BB's. 
+ */ +#define BBSHIFT 9 +#define BBSIZE (1 << BBSHIFT) +#define BBMASK (BBSIZE-1) +#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT) +#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT) +#define BBTOB(bbs) ((bbs) << BBSHIFT) +#endif + +#endif /* __XFS_FS_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.c b/kernel/fs/xfs/libxfs/xfs_ialloc.c new file mode 100644 index 000000000..1c9e75521 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc.c @@ -0,0 +1,2202 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_icreate_item.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +/* + * Allocation group level functions. 
+ */ +static inline int +xfs_ialloc_cluster_alignment( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + return mp->m_sb.sb_inoalignmt; + return 1; +} + +/* + * Lookup a record by ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + xfs_lookup_t dir, /* <=, >=, == */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); +} + +/* + * Update the record referred to by cur to the value given. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec) /* btree record */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec, /* btree record */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* + * Insert a single inobt record. Cursor must already point to desired location. 
+ */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* + * Verify that the number of free inodes in the AGI is correct. 
+ */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). + */ +int +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int nbufs, blks_per_cluster, inodes_per_cluster; + int version; + int i, j; + xfs_daddr_t d; + xfs_ino_t ino = 0; + + /* + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. + */ + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; + + /* + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. 
+ * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have to log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inodes we log the entire buffer rather than just the + * inode cores. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + + /* + * log the initialisation that is about to take place as a + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + mp->m_sb.sb_inodesize, length, gen); + } else + version = 2; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + if (!fbuf) + return -ENOMEM; + + /* Initialize the inode buffers and log them appropriately. 
*/ + fbuf->b_ops = &xfs_inode_buf_ops; + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); + for (i = 0; i < inodes_per_cluster; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = xfs_dinode_size(version); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else if (tp) { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures that they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); + } + } + return 0; +} + +/* + * Allocate new inodes in the allocation group specified by agbp. + * Return 0 for success, else error code. 
+ */ +STATIC int /* error code or 0 */ +xfs_ialloc_ag_alloc( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *agbp, /* alloc group buffer */ + int *alloc) +{ + xfs_agi_t *agi; /* allocation group header */ + xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_agnumber_t agno; + int error; + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + int isaligned = 0; /* inode allocation at stripe unit */ + /* boundary */ + struct xfs_perag *pag; + + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = tp->t_mountp; + + /* + * Locking will ensure that we don't have two callers in here + * at one time. + */ + newlen = args.mp->m_ialloc_inos; + if (args.mp->m_maxicount && + percpu_counter_read_positive(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) + return -ENOSPC; + args.minlen = args.maxlen = args.mp->m_ialloc_blks; + /* + * First try to allocate inodes contiguous with the last-allocated + * chunk of inodes. If the filesystem is striped, this will fill + * an entire stripe unit with inodes. + */ + agi = XFS_BUF_TO_AGI(agbp); + newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + args.mp->m_ialloc_blks; + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.prod = 1; + + /* + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every other + * allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. 
Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. + */ + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; + + /* Allow space for the inode btree to split. */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; + } else + args.fsbno = NULLFSBLOCK; + + if (unlikely(args.fsbno == NULLFSBLOCK)) { + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size + * pieces, so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(args.mp); + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.prod = 1; + /* + * Allow space for the inode btree to split. 
+ */ + args.minleft = args.mp->m_in_maxlevels - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + /* + * If stripe alignment is turned on, then try again with cluster + * alignment. + */ + if (isaligned && args.fsbno == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); + if ((error = xfs_alloc_vextent(&args))) + return error; + } + + if (args.fsbno == NULLFSBLOCK) { + *alloc = 0; + return 0; + } + ASSERT(args.len == args.minlen); + + /* + * Stamp and write the inode buffers. + * + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, + args.len, prandom_u32()); + + if (error) + return error; + /* + * Convert the results. + */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); + agi->agi_newino = cpu_to_be32(newino); + + /* + * Insert records describing the new inode chunk into the btrees. + */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) + return error; + } + /* + * Log allocation group header fields + */ + xfs_ialloc_log_agi(tp, agbp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); + /* + * Modify/log superblock values for inode count and inode free count. 
+ */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); + *alloc = 1; + return 0; +} + +STATIC xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor >= mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +/* + * Select an allocation group to look for a free inode in, based on the parent + * inode and the mode. Return the allocation group buffer. + */ +STATIC xfs_agnumber_t +xfs_ialloc_ag_select( + xfs_trans_t *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent directory inode number */ + umode_t mode, /* bits set to indicate file type */ + int okalloc) /* ok to allocate more space */ +{ + xfs_agnumber_t agcount; /* number of ag's in the filesystem */ + xfs_agnumber_t agno; /* current ag number */ + int flags; /* alloc buffer locking flags */ + xfs_extlen_t ineed; /* blocks needed for inode allocation */ + xfs_extlen_t longest = 0; /* longest extent available */ + xfs_mount_t *mp; /* mount point structure */ + int needspace; /* file mode implies space allocated */ + xfs_perag_t *pag; /* per allocation group data */ + xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; + + /* + * Files of these types need at least one block if length > 0 + * (and they won't fit in the inode, but that's hard to figure out). + */ + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + mp = tp->t_mountp; + agcount = mp->m_maxagi; + if (S_ISDIR(mode)) + pagno = xfs_ialloc_next_ag(mp); + else { + pagno = XFS_INO_TO_AGNO(mp, parent); + if (pagno >= agcount) + pagno = 0; + } + + ASSERT(pagno < agcount); + + /* + * Loop through allocation groups, looking for one with a little + * free space in it. Note we don't look for free inodes, exactly. 
+ * Instead, we include whether there is a need to allocate inodes + * to mean that blocks must be allocated for them, + * if none are currently free. + */ + agno = pagno; + flags = XFS_ALLOC_FLAG_TRYLOCK; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto nextag; + } + + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; + } + + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) + goto nextag; + } + + /* + * Check that there is enough free space for the file plus a + * chunk of inodes if we need to allocate some. If this is the + * first pass across the AGs, take into account the potential + * space needed for alignment of inode chunks when checking the + * longest contiguous free space in the AG - this prevents us + * from getting ENOSPC because we have free space larger than + * m_ialloc_blks but alignment constraints prevent us from using + * it. + * + * If we can't find an AG with space for full alignment slack to + * be taken into account, we must be near ENOSPC in all AGs. + * Hence we don't include alignment for the second pass and so + * if we fail allocation due to alignment issues then it is most + * likely a real ENOSPC condition. + */ + ineed = mp->m_ialloc_blks; + if (flags && ineed > 1) + ineed += xfs_ialloc_cluster_alignment(mp); + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; + } +nextag: + xfs_perag_put(pag); + /* + * No point in iterating over the rest, if we're shutting + * down. 
+ */ + if (XFS_FORCED_SHUTDOWN(mp)) + return NULLAGNUMBER; + agno++; + if (agno >= agcount) + agno = 0; + if (agno == pagno) { + if (flags == 0) + return NULLAGNUMBER; + flags = 0; + } + } +} + +/* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + } + + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); + } + + return 0; +} + +/* + * Allocate an inode using the inobt-only algorithm. 
+ */
+/*
+ * tp:     transaction the AGI and btree updates are logged in
+ * agbp:   AGI buffer of the AG to allocate from
+ * parent: inode number of the parent, used as the locality target
+ * inop:   set to the allocated inode number on success
+ *
+ * On success the chosen inode is cleared from its inobt record's free mask,
+ * and the AGI, per-AG and superblock free counts are each decremented by one.
+ * Returns 0 or a negative errno; on error *inop is left untouched.
+ */
+STATIC int
+xfs_dialloc_ag_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t		pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t		pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag	*pag;
+	struct xfs_btree_cur	*cur, *tcur;
+	struct xfs_inobt_rec_incore rec, trec;
+	xfs_ino_t		ino;
+	int			error;
+	int			offset;
+	int			i, j;
+
+	pag = xfs_perag_get(mp, agno);
+
+	ASSERT(pag->pagi_init);
+	ASSERT(pag->pagi_inodeok);
+	ASSERT(pag->pagi_freecount > 0);
+
+ restart_pagno:
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * If in the same AG as the parent, try to get near the parent.
+	 */
+	if (pagno == agno) {
+		int		doneleft;	/* done, to the left */
+		int		doneright;	/* done, to the right */
+		int		searchdistance = 10;
+
+		error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+
+		error = xfs_inobt_get_rec(cur, &rec, &j);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
+
+		if (rec.ir_freecount > 0) {
+			/*
+			 * Found a free inode in the same chunk
+			 * as the parent, done.
+			 */
+			goto alloc_inode;
+		}
+
+
+		/*
+		 * In the same AG as parent, but parent's chunk is full.
+		 */
+
+		/* duplicate the cursor, search left & right simultaneously */
+		error = xfs_btree_dup_cursor(cur, &tcur);
+		if (error)
+			goto error0;
+
+		/*
+		 * Skip to last blocks looked up if same parent inode.
+		 */
+		if (pagino != NULLAGINO &&
+		    pag->pagl_pagino == pagino &&
+		    pag->pagl_leftrec != NULLAGINO &&
+		    pag->pagl_rightrec != NULLAGINO) {
+			error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
+						   &trec, &doneleft);
+			if (error)
+				goto error1;
+
+			error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
+						   &rec, &doneright);
+			if (error)
+				goto error1;
+		} else {
+			/* search left with tcur, back up 1 record */
+			error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
+			if (error)
+				goto error1;
+
+			/* search right with cur, go forward 1 record. */
+			error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * Loop until we find an inode chunk with a free inode.
+		 */
+		while (!doneleft || !doneright) {
+			int	useleft;  /* using left inode chunk this time */
+
+			if (!--searchdistance) {
+				/*
+				 * Not in range - save last search
+				 * location and allocate a new inode
+				 */
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto newino;
+			}
+
+			/* figure out the closer block if both are valid. */
+			if (!doneleft && !doneright) {
+				useleft = pagino -
+				 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
+				  rec.ir_startino - pagino;
+			} else {
+				useleft = !doneleft;
+			}
+
+			/* free inodes to the left? */
+			if (useleft && trec.ir_freecount) {
+				xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+				cur = tcur;
+
+				/*
+				 * Cache both search positions before rec is
+				 * overwritten; copying trec into rec first
+				 * would store the left record's start inode
+				 * as the right-hand resume point.
+				 */
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				rec = trec;
+				goto alloc_inode;
+			}
+
+			/* free inodes to the right? */
+			if (!useleft && rec.ir_freecount) {
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+				pag->pagl_leftrec = trec.ir_startino;
+				pag->pagl_rightrec = rec.ir_startino;
+				pag->pagl_pagino = pagino;
+				goto alloc_inode;
+			}
+
+			/* get next record to check */
+			if (useleft) {
+				error = xfs_ialloc_next_rec(tcur, &trec,
+								 &doneleft, 1);
+			} else {
+				error = xfs_ialloc_next_rec(cur, &rec,
+								 &doneright, 0);
+			}
+			if (error)
+				goto error1;
+		}
+
+		/*
+		 * We've reached the end of the btree. Because we only search
+		 * a small part of the btree on each pass, there must be free
+		 * inodes closer to the parent inode than where we are now.
+		 * Drop the cached positions and restart the search.
+		 */
+		pag->pagl_pagino = NULLAGINO;
+		pag->pagl_leftrec = NULLAGINO;
+		pag->pagl_rightrec = NULLAGINO;
+		xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		goto restart_pagno;
+	}
+
+	/*
+	 * In a different AG from the parent.
+	 * See if the most recently allocated block has any free.
+	 */
+newino:
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			goto error0;
+
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, &rec, &j);
+			if (error)
+				goto error0;
+
+			if (j == 1 && rec.ir_freecount > 0) {
+				/*
+				 * The last chunk allocated in the group
+				 * still has a free inode.
+				 */
+				goto alloc_inode;
+			}
+		}
+	}
+
+	/*
+	 * None left in the last group, search the whole AG
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+
+	for (;;) {
+		error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+		if (rec.ir_freecount > 0)
+			break;
+		error = xfs_btree_increment(cur, 0, &i);
+		if (error)
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	}
+
+alloc_inode:
+	/* Take the lowest-numbered free inode in the chosen chunk. */
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	error = xfs_inobt_update(cur, &rec);
+	if (error)
+		goto error0;
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+/*
+ * pagino: AG-relative inode number of the parent (the locality target)
+ * ocur:   in/out finobt cursor; on success it points at the chosen record
+ *         and may be a different cursor from the one passed in
+ * rec:    filled in with the chosen finobt record
+ *
+ * Returns 0 or a negative errno. On error *ocur is left as passed in and is
+ * still owned by the caller.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+	xfs_agino_t			pagino,
+	struct xfs_btree_cur		**ocur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	struct xfs_btree_cur		*lcur = *ocur;	/* left search cursor */
+	struct xfs_btree_cur		*rcur;	/* right search cursor */
+	struct xfs_inobt_rec_incore	rrec;
+	int				error;
+	int				i, j;
+
+	error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+	if (error)
+		return error;
+
+	if (i == 1) {
+		error = xfs_inobt_get_rec(lcur, rec, &i);
+		if (error)
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
+
+		/*
+		 * See if we've landed in the parent inode record. The finobt
+		 * only tracks chunks with at least one free inode, so record
+		 * existence is enough.
+		 */
+		if (pagino >= rec->ir_startino &&
+		    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+			return 0;
+	}
+
+	error = xfs_btree_dup_cursor(lcur, &rcur);
+	if (error)
+		return error;
+
+	error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+	if (error)
+		goto error_rcur;
+	if (j == 1) {
+		error = xfs_inobt_get_rec(rcur, &rrec, &j);
+		if (error)
+			goto error_rcur;
+		XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
+	}
+
+	XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
+	if (i == 1 && j == 1) {
+		/*
+		 * Both the left and right records are valid. Choose the closer
+		 * inode chunk to the target.
+		 *
+		 * The left distance is measured from the last inode of the
+		 * left chunk, as in xfs_dialloc_ag_inobt(). The parent is
+		 * known to lie beyond that chunk here (otherwise we returned
+		 * above), so the subtraction cannot wrap.
+		 */
+		if ((pagino - (rec->ir_startino + XFS_INODES_PER_CHUNK - 1)) >
+		    (rrec.ir_startino - pagino)) {
+			*rec = rrec;
+			xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+			*ocur = rcur;
+		} else {
+			xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+		}
+	} else if (j == 1) {
+		/* only the right record is valid */
+		*rec = rrec;
+		xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+		*ocur = rcur;
+	} else if (i == 1) {
+		/* only the left record is valid */
+		xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+	}
+
+	return 0;
+
+error_rcur:
+	xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ *
+ * The first record in the AG is also used when the hinted chunk has no
+ * record in the finobt. rec is filled in with the chosen record.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+	struct xfs_agi			*agi,
+	struct xfs_btree_cur		*cur,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int error;
+	int i;
+
+	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
+		if (error)
+			return error;
+		if (i == 1) {
+			error = xfs_inobt_get_rec(cur, rec, &i);
+			if (error)
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+			return 0;
+		}
+	}
+
+	/*
+	 * Find the first inode available in the AG.
+	 */
+	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	error = xfs_inobt_get_rec(cur, rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+/*
+ * frec is the finobt record after the inode at 'offset' has already been
+ * cleared from it. The matching inobt record is looked up by frec's start
+ * inode, the same inode is cleared from it, and the result must equal frec
+ * before the inobt record is written back.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+	struct xfs_btree_cur		*cur,	/* inobt cursor */
+	struct xfs_inobt_rec_incore	*frec,	/* finobt record */
+	int				offset) /* inode offset */
+{
+	struct xfs_inobt_rec_incore	rec;
+	int				error;
+	int				i;
+
+	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
+	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+
+	/* Both trees must now describe the chunk identically. */
+	XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
+				  (rec.ir_freecount == frec->ir_freecount));
+
+	return xfs_inobt_update(cur, &rec);
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ *
+ * On success *inop is set to the allocated inode number and the AGI, per-AG
+ * and superblock free inode counts have each been decremented by one.
+ */
+STATIC int
+xfs_dialloc_ag(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_ino_t		parent,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	xfs_agnumber_t			pagno = XFS_INO_TO_AGNO(mp, parent);
+	xfs_agino_t			pagino = XFS_INO_TO_AGINO(mp, parent);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;	/* finobt cursor */
+	struct xfs_btree_cur		*icur;	/* inobt cursor */
+	struct xfs_inobt_rec_incore	rec;
+	xfs_ino_t			ino;
+	int				error;
+	int				offset;
+	int				i;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+	pag = xfs_perag_get(mp, agno);
+
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = be32_to_cpu(agi->agi_newino);
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The search algorithm depends on whether we're in the same AG as the
+	 * parent. If so, find the closest available inode to the parent. If
+	 * not, consider the agi hint or find the first free inode in the AG.
+	 */
+	if (agno == pagno)
+		error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+	else
+		error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+	if (error)
+		goto error_cur;
+
+	/* Take the lowest-numbered free inode in the chosen chunk. */
+	offset = xfs_lowbit64(rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+	/*
+	 * Modify or remove the finobt record.
+	 */
+	rec.ir_free &= ~XFS_INOBT_MASK(offset);
+	rec.ir_freecount--;
+	if (rec.ir_freecount)
+		error = xfs_inobt_update(cur, &rec);
+	else
+		error = xfs_btree_delete(cur, &i);
+	if (error)
+		goto error_cur;
+
+	/*
+	 * The finobt has now been updated appropriately. We haven't updated the
+	 * agi and superblock yet, so we can create an inobt cursor and validate
+	 * the original freecount. If all is well, make the equivalent update to
+	 * the inobt using the finobt record and offset information.
+	 */
+	icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+
+	error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+	if (error)
+		goto error_icur;
+
+	/*
+	 * Both trees have now been updated. We must update the perag and
+	 * superblock before we can check the freecount for each btree.
+	 */
+	be32_add_cpu(&agi->agi_freecount, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	pag->pagi_freecount--;
+
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+	error = xfs_check_agi_freecount(icur, agi);
+	if (error)
+		goto error_icur;
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error_icur;
+
+	xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_perag_put(pag);
+	*inop = ino;
+	return 0;
+
+error_icur:
+	xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Allocate an inode on disk.
+ *
+ * Mode is used to tell whether the new inode will need space, and whether it
+ * is a directory.
+ *
+ * This function is designed to be called twice if it has to do an allocation
+ * to make more free inodes. On the first call, *IO_agbp should be set to NULL.
+ * If an inode is available without having to performn an allocation, an inode
+ * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
+ * new transaction, and call xfs_dialloc() again, passing in the previous value
+ * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
+ * buffer is locked across the two calls, the second call is guaranteed to have
+ * a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the on-disk
+ * data structures are updated. The inode itself is not read in, since doing so
+ * would break ordering constraints with xfs_reclaim.
+ */
+/*
+ * Return value summary: 0 with *inop set to an inode number on success; 0
+ * with *inop == NULLFSINO when no inode could be handed out on this call
+ * (either nothing was found, or a chunk was just allocated and *IO_agbp is
+ * set for the second call); -ENOSPC when the scan found nothing and the
+ * inode count ceiling (m_maxicount) prevented allocating more; any other
+ * negative errno on failure.
+ */
+int
+xfs_dialloc(
+	struct xfs_trans	*tp,
+	xfs_ino_t		parent,
+	umode_t			mode,
+	int			okalloc,
+	struct xfs_buf		**IO_agbp,
+	xfs_ino_t		*inop)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*agbp;
+	xfs_agnumber_t		agno;
+	int			error;
+	int			ialloced;
+	int			noroom = 0;
+	xfs_agnumber_t		start_agno;
+	struct xfs_perag	*pag;
+
+	if (*IO_agbp) {
+		/*
+		 * If the caller passes in a pointer to the AGI buffer,
+		 * continue where we left off before.  In this case, we
+		 * know that the allocation group has free inodes.
+		 */
+		agbp = *IO_agbp;
+		goto out_alloc;
+	}
+
+	/*
+	 * We do not have an agbp, so select an initial allocation
+	 * group for inode allocation.
+	 */
+	start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+	if (start_agno == NULLAGNUMBER) {
+		*inop = NULLFSINO;
+		return 0;
+	}
+
+	/*
+	 * If we have already hit the ceiling of inode blocks then clear
+	 * okalloc so we scan all available agi structures for a free
+	 * inode.
+	 *
+	 * Read rough value of mp->m_icount by percpu_counter_read_positive,
+	 * which will sacrifice the preciseness but improve the performance.
+	 */
+	if (mp->m_maxicount &&
+	    percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
+							> mp->m_maxicount) {
+		noroom = 1;
+		okalloc = 0;
+	}
+
+	/*
+	 * Loop until we find an allocation group that either has free inodes
+	 * or in which we can allocate some inodes.  Iterate through the
+	 * allocation groups upward, wrapping at the end.
+	 */
+	agno = start_agno;
+	for (;;) {
+		pag = xfs_perag_get(mp, agno);
+		if (!pag->pagi_inodeok) {
+			xfs_ialloc_next_ag(mp);
+			goto nextag;
+		}
+
+		if (!pag->pagi_init) {
+			error = xfs_ialloc_pagi_init(mp, tp, agno);
+			if (error)
+				goto out_error;
+		}
+
+		/*
+		 * Do a first racy fast path check if this AG is usable.
+		 */
+		if (!pag->pagi_freecount && !okalloc)
+			goto nextag;
+
+		/*
+		 * Then read in the AGI buffer and recheck with the AGI buffer
+		 * lock held.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		if (error)
+			goto out_error;
+
+		if (pag->pagi_freecount) {
+			xfs_perag_put(pag);
+			goto out_alloc;
+		}
+
+		if (!okalloc)
+			goto nextag_relse_buffer;
+
+
+		error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
+		if (error) {
+			xfs_trans_brelse(tp, agbp);
+
+			if (error != -ENOSPC)
+				goto out_error;
+
+			/* -ENOSPC here is reported as "no inode", not an error. */
+			xfs_perag_put(pag);
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+		if (ialloced) {
+			/*
+			 * We successfully allocated some inodes, return
+			 * the current context to the caller so that it
+			 * can commit the current transaction and call
+			 * us again where we left off.
+			 */
+			ASSERT(pag->pagi_freecount > 0);
+			xfs_perag_put(pag);
+
+			*IO_agbp = agbp;
+			*inop = NULLFSINO;
+			return 0;
+		}
+
+nextag_relse_buffer:
+		xfs_trans_brelse(tp, agbp);
+nextag:
+		xfs_perag_put(pag);
+		if (++agno == mp->m_sb.sb_agcount)
+			agno = 0;
+		/* Wrapped all the way round without finding anything. */
+		if (agno == start_agno) {
+			*inop = NULLFSINO;
+			return noroom ? -ENOSPC : 0;
+		}
+	}
+
+out_alloc:
+	*IO_agbp = NULL;
+	return xfs_dialloc_ag(tp, agbp, parent, inop);
+out_error:
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Mark inode agino free in the inode btree of the AG described by agbp.
+ *
+ * If that leaves every inode of the chunk free and XFS_MOUNT_IKEEP is not
+ * set, the record is deleted, *deleted is set to 1, *first_ino is set to the
+ * chunk's first inode number and the chunk's blocks are queued on flist.
+ * Otherwise *deleted is set to 0 and the record is updated in place. The
+ * post-update record is copied to *orec on success.
+ */
+STATIC int
+xfs_difree_inobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_bmap_free		*flist,
+	int				*deleted,
+	xfs_ino_t			*first_ino,
+	struct xfs_inobt_rec_incore	*orec)
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_perag		*pag;
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				ilen;
+	int				error;
+	int				i;
+	int				off;
+
+	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
+	ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
+	/*
+	 * Initialize the cursor.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	/*
+	 * Look for the entry describing this inode.
+	 */
+	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
+		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+			__func__, error);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
+	/*
+	 * Get the offset in the inode chunk.
+	 */
+	off = agino - rec.ir_startino;
+	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+	ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
+	/*
+	 * Mark the inode free & increment the count.
+	 */
+	rec.ir_free |= XFS_INOBT_MASK(off);
+	rec.ir_freecount++;
+
+	/*
+	 * When an inode cluster is free, it becomes eligible for removal
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
+	    (rec.ir_freecount == mp->m_ialloc_inos)) {
+
+		*deleted = 1;
+		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+		/*
+		 * Remove the inode cluster from the AGI B+Tree, adjust the
+		 * AGI and Superblock inode counts, and mark the disk space
+		 * to be freed when the transaction is committed.
+		 *
+		 * The free counts drop by ilen - 1 because the inode being
+		 * freed now was never added to them.
+		 */
+		ilen = mp->m_ialloc_inos;
+		be32_add_cpu(&agi->agi_count, -ilen);
+		be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount -= ilen - 1;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
+
+		if ((error = xfs_btree_delete(cur, &i))) {
+			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+				  XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+				  mp->m_ialloc_blks, flist, mp);
+	} else {
+		*deleted = 0;
+
+		error = xfs_inobt_update(cur, &rec);
+		if (error) {
+			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+				__func__, error);
+			goto error0;
+		}
+
+		/*
+		 * Change the inode free counts and log the ag/sb changes.
+		 */
+		be32_add_cpu(&agi->agi_freecount, 1);
+		xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+		pag = xfs_perag_get(mp, agno);
+		pag->pagi_freecount++;
+		xfs_perag_put(pag);
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+	}
+
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error0;
+
+	*orec = rec;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free an inode in the free inode btree.
+ *
+ * @ibtrec is the inobt record for the chunk as it stands after the inode has
+ * been marked free there. The finobt copy is brought to the same state:
+ * inserted if the chunk had no free inodes before, removed if the whole chunk
+ * is now free and chunks are not being kept, updated otherwise.
+ *
+ * Returns 0 on success or a negative errno; the cursor is always torn down
+ * before returning.
+ */
+STATIC int
+xfs_difree_finobt(
+	struct xfs_mount		*mp,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	xfs_agino_t			agino,
+	struct xfs_inobt_rec_incore	*ibtrec) /* inobt record */
+{
+	struct xfs_agi			*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t			agno = be32_to_cpu(agi->agi_seqno);
+	struct xfs_btree_cur		*cur;
+	struct xfs_inobt_rec_incore	rec;
+	int				offset = agino - ibtrec->ir_startino;
+	int				error;
+	int				i;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	if (i == 0) {
+		/*
+		 * If the record does not exist in the finobt, we must have just
+		 * freed an inode in a previously fully allocated chunk. If not,
+		 * something is out of sync.
+		 */
+		XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
+
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+					     ibtrec->ir_free, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+
+		goto out;
+	}
+
+	/*
+	 * Read and update the existing record. We could just copy the ibtrec
+	 * across here, but that would defeat the purpose of having redundant
+	 * metadata. By making the modifications independently, we can catch
+	 * corruptions that we wouldn't see if we just copied from one record
+	 * to another.
+	 */
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		goto error;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
+
+	rec.ir_free |= XFS_INOBT_MASK(offset);
+	rec.ir_freecount++;
+
+	/* The independently updated copy must now agree with the inobt. */
+	XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
+				(rec.ir_freecount == ibtrec->ir_freecount),
+				error);
+
+	/*
+	 * The content of inobt records should always match between the inobt
+	 * and finobt. The lifecycle of records in the finobt is different from
+	 * the inobt in that the finobt only tracks records with at least one
+	 * free inode. Hence, if all of the inodes are free and we aren't
+	 * keeping inode chunks permanently on disk, remove the record.
+	 * Otherwise, update the record with the new information.
+	 */
+	if (rec.ir_freecount == mp->m_ialloc_inos &&
+	    !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+		error = xfs_btree_delete(cur, &i);
+		if (error)
+			goto error;
+		ASSERT(i == 1);
+	} else {
+		error = xfs_inobt_update(cur, &rec);
+		if (error)
+			goto error;
+	}
+
+out:
+	error = xfs_check_agi_freecount(cur, agi);
+	if (error)
+		goto error;
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ *
+ * Returns 0 on success, -EINVAL if the inode number does not decompose into
+ * a valid AG / AG-inode / AG-block triple, or the error from reading the AGI
+ * or updating either inode btree.
+ */
+int
+xfs_difree(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_ino_t		inode,		/* inode to be freed */
+	struct xfs_bmap_free	*flist,		/* extents to free */
+	int			*deleted,/* set if inode cluster was deleted */
+	xfs_ino_t		*first_ino)/* first inode in deleted cluster */
+{
+	/* REFERENCED */
+	xfs_agblock_t		agbno;	/* block number containing inode */
+	struct xfs_buf		*agbp;	/* buffer for allocation group header */
+	xfs_agino_t		agino;	/* allocation group inode number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	int			error;	/* error return value */
+	struct xfs_mount	*mp;	/* mount structure for filesystem */
+	struct xfs_inobt_rec_incore rec;/* btree record */
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Break up inode number into its components.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, inode);
+	if (agno >= mp->m_sb.sb_agcount) {
+		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+			__func__, agno, mp->m_sb.sb_agcount);
+		ASSERT(0);
+		return -EINVAL;
+	}
+	agino = XFS_INO_TO_AGINO(mp, inode);
+	/* Recombining must give back the same number, else stray bits are set. */
+	if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+			__func__, (unsigned long long)inode,
+			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+		ASSERT(0);
+		return -EINVAL;
+	}
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agbno >= mp->m_sb.sb_agblocks) {
+		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+			__func__, agbno, mp->m_sb.sb_agblocks);
+		ASSERT(0);
+		return -EINVAL;
+	}
+	/*
+	 * Get the allocation group header.
+	 */
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+			__func__, error);
+		return error;
+	}
+
+	/*
+	 * Fix up the inode allocation btree. The updated record comes back in
+	 * rec so the finobt can be brought to the same state below.
+	 */
+	error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+				 &rec);
+	if (error)
+		goto error0;
+
+	/*
+	 * Fix up the free inode btree.
+	 */
+	if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+		error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+		if (error)
+			goto error0;
+	}
+
+	return 0;
+
+error0:
+	return error;
+}
+
+/*
+ * Look up the inobt record covering @agino in AG @agno and report where its
+ * chunk starts.
+ *
+ * On success *chunk_agbno is the AG block holding the first inode of the
+ * chunk and *offset_agbno is the distance in blocks from there to @agbno.
+ * Returns -EINVAL if no record covers @agino or, for XFS_IGET_UNTRUSTED
+ * lookups, if the record marks the inode as free; any other error comes from
+ * reading the AGI or walking the btree.
+ */
+STATIC int
+xfs_imap_lookup(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	xfs_agblock_t		agbno,
+	xfs_agblock_t		*chunk_agbno,
+	xfs_agblock_t		*offset_agbno,
+	int			flags)
+{
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_buf		*agbp;
+	int			error;
+	int			i;
+
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	if (error) {
+		xfs_alert(mp,
+			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+			__func__, error, agno);
+		return error;
+	}
+
+	/*
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
+	 */
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
+	if (!error) {
+		if (i)
+			error = xfs_inobt_get_rec(cur, &rec, &i);
+		if (!error && i == 0)
+			error = -EINVAL;
+	}
+
+	xfs_trans_brelse(tp, agbp);
+	/*
+	 * Tear the cursor down in error mode when the walk failed, as every
+	 * other error path in this file does, so teardown is told that the
+	 * cursor may have been left partially set up.
+	 */
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + mp->m_ialloc_inos <= agino)
+		return -EINVAL;
+
+	/* for untrusted inodes check it is allocated first */
+	if ((flags & XFS_IGET_UNTRUSTED) &&
+	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
+		return -EINVAL;
+
+	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
+	*offset_agbno = agbno - *chunk_agbno;
+	return 0;
+}
+
+/*
+ * Return the location of the inode in imap, for mapping it into a buffer.
+ *
+ * Fills in imap->im_blkno, im_len and im_boffset for the cluster buffer that
+ * holds @ino. Returns 0 on success, -EINVAL if @ino cannot be a valid inode
+ * number for this filesystem or maps past the end of the data device, or the
+ * error from the inode btree lookup when one is needed.
+ */
+int
+xfs_imap(
+	xfs_mount_t	 *mp,	/* file system mount structure */
+	xfs_trans_t	 *tp,	/* transaction pointer */
+	xfs_ino_t	ino,	/* inode to locate */
+	struct xfs_imap	*imap,	/* location map structure */
+	uint		flags)	/* flags for inode btree lookup */
+{
+	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
+	xfs_agino_t	agino;	/* inode number within alloc group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	int		blks_per_cluster; /* num blocks per inode cluster */
+	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
+	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
+	int		error;	/* error code */
+	int		offset;	/* index of inode in its buffer */
+	xfs_agblock_t	offset_agbno;	/* blks from chunk start to inode */
+
+	ASSERT(ino != NULLFSINO);
+
+	/*
+	 * Split up the inode number into its parts.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+		/*
+		 * Don't output diagnostic information for untrusted inodes
+		 * as they can be invalid without implying corruption.
+		 */
+		if (flags & XFS_IGET_UNTRUSTED)
+			return -EINVAL;
+		if (agno >= mp->m_sb.sb_agcount) {
+			xfs_alert(mp,
+				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+				__func__, agno, mp->m_sb.sb_agcount);
+		}
+		if (agbno >= mp->m_sb.sb_agblocks) {
+			xfs_alert(mp,
+		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+				__func__, (unsigned long long)agbno,
+				(unsigned long)mp->m_sb.sb_agblocks);
+		}
+		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+			xfs_alert(mp,
+		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+				__func__, ino,
+				XFS_AGINO_TO_INO(mp, agno, agino));
+		}
+		xfs_stack_trace();
+#endif /* DEBUG */
+		return -EINVAL;
+	}
+
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+
+	/*
+	 * For bulkstat and handle lookups, we have an untrusted inode number
+	 * that we have to verify is valid. We cannot do this just by reading
+	 * the inode buffer as it may have been unlinked and removed leaving
+	 * inodes in stale state on disk. Hence we have to do a btree lookup
+	 * in all cases where an untrusted inode number is passed.
+	 */
+	if (flags & XFS_IGET_UNTRUSTED) {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+		goto out_map;
+	}
+
+	/*
+	 * If the inode cluster size is the same as the blocksize or
+	 * smaller we get to the buffer by simple arithmetic.
+	 */
+	if (blks_per_cluster == 1) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+
+		imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		imap->im_len = XFS_FSB_TO_BB(mp, 1);
+		imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+		return 0;
+	}
+
+	/*
+	 * If the inode chunks are aligned then use simple maths to
+	 * find the location. Otherwise we have to do a btree
+	 * lookup to find the location.
+	 */
+	if (mp->m_inoalign_mask) {
+		offset_agbno = agbno & mp->m_inoalign_mask;
+		chunk_agbno = agbno - offset_agbno;
+	} else {
+		error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
+					&chunk_agbno, &offset_agbno, flags);
+		if (error)
+			return error;
+	}
+
+out_map:
+	ASSERT(agbno >= chunk_agbno);
+	/* Round the inode's block down to the start of its cluster. */
+	cluster_agbno = chunk_agbno +
+		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+	/* Inode index within the cluster: whole blocks first, then in-block. */
+	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+		XFS_INO_TO_OFFSET(mp, ino);
+
+	imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+	imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+	imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+	/*
+	 * If the inode number maps to a block outside the bounds
+	 * of the file system then return -EINVAL rather than calling
+	 * read_buf and panicking when we get an error from the
+	 * driver.
+	 */
+	if ((imap->im_blkno + imap->im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		xfs_alert(mp,
+	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+			__func__, (unsigned long long) imap->im_blkno,
+			(unsigned long long) imap->im_len,
+			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ *
+ * The height is that of the tallest inode btree an AG could need: one leaf
+ * record per inode chunk for every AG inode number that can be expressed,
+ * with every block holding only its minimum number of records.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	/* Largest possible AG inode count, expressed in inode chunks. */
+	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+					XFS_INODES_PER_CHUNK_LOG;
+	/*
+	 * Use the inode btree's own minimum record counts, the same table
+	 * xfs_inobt_get_minrecs() reports to the btree code. The free space
+	 * btree table (m_alloc_mnr) describes blocks with a different
+	 * record and key size and so gives the wrong fan-out here.
+	 * NOTE(review): assumes m_inobt_mnr[] is already populated when this
+	 * runs during mount -- confirm against the caller.
+	 */
+	minleafrecs = mp->m_inobt_mnr[0];
+	minnoderecs = mp->m_inobt_mnr[1];
+	/* Leaf blocks needed at minimum fill, rounded up. */
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	/* Add node levels until a single root block covers the level below. */
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
+ */ +void +xfs_ialloc_log_agi( + xfs_trans_t *tp, /* transaction pointer */ + xfs_buf_t *bp, /* allocation group header buffer */ + int fields) /* bitmask of fields to log */ +{ + int first; /* first byte number */ + int last; /* last byte number */ + static const short offsets[] = { /* field starting offsets */ + /* keep in sync with bit definitions */ + offsetof(xfs_agi_t, agi_magicnum), + offsetof(xfs_agi_t, agi_versionnum), + offsetof(xfs_agi_t, agi_seqno), + offsetof(xfs_agi_t, agi_length), + offsetof(xfs_agi_t, agi_count), + offsetof(xfs_agi_t, agi_root), + offsetof(xfs_agi_t, agi_level), + offsetof(xfs_agi_t, agi_freecount), + offsetof(xfs_agi_t, agi_newino), + offsetof(xfs_agi_t, agi_dirino), + offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), + sizeof(xfs_agi_t) + }; +#ifdef DEBUG + xfs_agi_t *agi; /* allocation group header */ + + agi = XFS_BUF_TO_AGI(bp); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); +#endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + + /* + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. + */ + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + + /* + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. 
+ */ + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } +} + +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) +#endif + +static bool +xfs_agi_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; + /* + * Validate the magic number of the agi block. + */ + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; + + if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. 
+ */ + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; + + xfs_check_agi_unlinked(agi); + return true; +} + +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + +/* + * Read in the allocation group header (inode allocation section) + */ +int +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + int error; + + trace_xfs_read_agi(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + if (error) + return error; + + xfs_buf_set_ref(*bpp, XFS_AGI_REF); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf 
**bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_ialloc_read_agi(mp, agno); + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_init) { + pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); + pag->pagi_init = 1; + } + + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); + return 0; +} + +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; + + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); + return 0; +} diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc.h b/kernel/fs/xfs/libxfs/xfs_ialloc.h new file mode 100644 index 000000000..100007d56 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_H__ +#define __XFS_IALLOC_H__ + +struct xfs_buf; +struct xfs_dinode; +struct xfs_imap; +struct xfs_mount; +struct xfs_trans; +struct xfs_btree_cur; + +/* Move inodes in clusters of this size */ +#define XFS_INODE_BIG_CLUSTER_SIZE 8192 + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} + +/* + * Make an inode pointer out of the buffer/offset. + */ +static inline struct xfs_dinode * +xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) +{ + return (struct xfs_dinode *) + (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); +} + +/* + * Allocate an inode on disk. + * Mode is used to tell whether the new inode will need space, and whether + * it is a directory. + * + * To work within the constraint of one allocation per transaction, + * xfs_dialloc() is designed to be called twice if it has to do an + * allocation to make more free inodes. If an inode is + * available without an allocation, agbp would be set to the current + * agbp and alloc_done set to false. + * If an allocation needed to be done, agbp would be set to the + * inode header of the allocation group and alloc_done set to true. + * The caller should then commit the current transaction and allocate a new + * transaction. xfs_dialloc() should then be called again with + * the agbp value returned from the previous call. + * + * Once we successfully pick an inode its number is returned and the + * on-disk data structures are updated. The inode itself is not read + * in, since doing so would break ordering constraints with xfs_reclaim. 
+ * + * *agbp should be set to NULL on the first call, *alloc_done set to FALSE. + */ +int /* error */ +xfs_dialloc( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t parent, /* parent inode (directory) */ + umode_t mode, /* mode bits for new inode */ + int okalloc, /* ok to allocate more space */ + struct xfs_buf **agbp, /* buf for a.g. inode header */ + xfs_ino_t *inop); /* inode number allocated */ + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int /* error */ +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ + +/* + * Return the location of the inode in imap, for mapping it into a buffer. + */ +int +xfs_imap( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags); /* flags for inode btree lookup */ + +/* + * Compute and fill in value of m_in_maxlevels. 
+ */ +void +xfs_ialloc_compute_maxlevels( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Log specified fields for the ag hdr (inode section) + */ +void +xfs_ialloc_log_agi( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *bp, /* allocation group header buffer */ + int fields); /* bitmask of fields to log */ + +/* + * Read in the allocation group header (inode allocation section) + */ +int /* error */ +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp); /* allocation group hdr buf */ + +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + +/* + * Lookup a record by ino in the btree given by cur. + */ +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); + +/* + * Get the data from the pointed-to record. + */ +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); + +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); + +int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + + +#endif /* __XFS_IALLOC_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c new file mode 100644 index 000000000..964c465ca --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" + + +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mnr[level != 0]; +} + +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} + +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); +} + +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = 
XFS_BUF_TO_AGI(agbp); + + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); +} + +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); + *stat = 1; + return 0; +} + +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + xfs_fsblock_t fsbno; + int error; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; + + xfs_trans_binval(cur->bc_tp, bp); + return error; +} + +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_inobt_mxr[level != 0]; +} + +STATIC void +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->inobt.ir_startino = rec->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + rec->inobt.ir_startino = key->inobt.ir_startino; +} + +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec 
*rec) +{ + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); +} + +/* + * initial value of ptr for lookup + */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + + ptr->s = agi->agi_root; +} + +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; +} + +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; +} + +static int +xfs_inobt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. 
+ */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): + break; + default: + return 0; + } + + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} + +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_inobt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + 
union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); +} + +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, +#endif +}; + +/* + * Allocate a new inode btree cursor. 
+ */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; + } + + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + return cur; +} + +/* + * Calculate number of records in an inobt btree block. + */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} diff --git a/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h new file mode 100644 index 000000000..d7ebea72c --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IALLOC_BTREE_H__ +#define __XFS_IALLOC_BTREE_H__ + +/* + * Inode map on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size depends on a superblock flag (the CRC feature bit). + */ +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) + +/* + * Record, key, and pointer address macros for btree blocks (index is 1-based). + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + /* pointers start after room for maxrecs keys */ \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); + +#endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.c b/kernel/fs/xfs/libxfs/xfs_inode_buf.c new file mode 100644 index 000000000..002b6b3a1 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.c @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_icache.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" + +/* + * Check that none of the inodes in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { /* only reported through xfs_alert(); no error is set on the buffer */ + xfs_alert(mp, + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); + } + } +} +#endif + +/* + * If we are doing readahead on an inode buffer, we might be in log recovery + * reading an inode allocation buffer that hasn't yet been replayed, and hence + * has not had the inode cores stamped into it. Hence for readahead, the buffer + * may be potentially invalid. + * + * If the readahead buffer is invalid, we don't want to mark it with an error, + * but we do want to clear the DONE status of the buffer so that a followup read + * will re-read it from disk.
This will ensure that we don't get unnecessary + * warnings during log recovery and we don't get unnecessary panics on debug + * kernels. + */ +static void +xfs_inode_buf_verify( + struct xfs_buf *bp, + bool readahead) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (readahead) { /* readahead: stop at the first bad inode without setting an error */ + bp->b_flags &= ~XBF_DONE; + return; + } + + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); /* NOTE(review): no return follows, so the loop goes on to check the remaining inodes -- confirm this is intended */ +#ifdef DEBUG + xfs_alert(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); +#endif + } + } + xfs_inobp_check(mp, bp); +} + + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +static void +xfs_inode_buf_readahead_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, true); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + +const struct xfs_buf_ops xfs_inode_buf_ra_ops = { /* differs from xfs_inode_buf_ops only in the read verifier */ + .verify_read = xfs_inode_buf_readahead_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + +/* + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode.
It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. + */ +int +xfs_imap_to_bp( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) +{ + struct xfs_buf *bp; + int error; + + buf_flags |= XBF_UNMAPPED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); + if (error) { + if (error == -EAGAIN) { + ASSERT(buf_flags & XBF_TRYLOCK); + return error; + } + + if (error == -EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return -EINVAL; /* XFS_IGET_UNTRUSTED lookups get -EINVAL and skip the warning below */ + + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; + } + + *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + return 0; +} + +void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ /* copy the on-disk dinode fields into the in-core copy, converting multi-byte fields from big-endian */ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); +
to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { /* these fields are copied for version 3 inodes only */ + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ /* write the in-core dinode fields into the on-disk structure in big-endian form */ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size =
cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; /* version 3 inodes always write a zero flush iteration */ + } else { /* other versions write the in-core flush iteration */ + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ /* returns true when the on-disk inode passes the checks below, false otherwise */ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) /* a version 3 inode is rejected unless the superblock has the CRC feature */ + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ /* recompute and store di_crc over sb_inodesize bytes; does nothing for versions below 3 */ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF); + dip->di_crc = xfs_end_cksum(crc); +} + +/* + * Read the disk inode attributes into
the in-core inode structure. + * + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the in-core inode core (ip->i_d); no buffer is read on this path */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else /* NOTE(review): looks unreachable, the enclosing if already requires xfs_sb_version_hascrc() */ + ip->i_d.di_version = 2; + return 0; + } + + /* + * Get pointers to the on-disk inode and the buffer containing it.
+ */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); + if (error) + return error; + + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); /* NOTE(review): the message text above ends with a duplicated "failed" */ + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = -EFSCORRUPTED; + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat_fork() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { /* the test actually made here is a non-zero on-disk di_mode */ + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat_fork(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + } + + /* + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode.
We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the rest of the code can assume the inode version is 2 + * or higher. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + ip->i_d.di_nlink = ip->i_d.di_onlink; /* the link count moves from di_onlink to di_nlink */ + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. + */ + out_brelse: /* reached on success (error == 0) and from the failure gotos above */ + xfs_trans_brelse(tp, bp); + return error; +} diff --git a/kernel/fs/xfs/libxfs/xfs_inode_buf.h b/kernel/fs/xfs/libxfs/xfs_inode_buf.h new file mode 100644 index 000000000..9308c47f2 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_buf.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_BUF_H__ +#define __XFS_INODE_BUF_H__ + +struct xfs_inode; +struct xfs_dinode; +struct xfs_icdinode; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); /* in-core to on-disk */ +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); /* on-disk to in-core */ + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) /* expands to nothing when DEBUG is not defined */ +#endif /* DEBUG */ + +#endif /* __XFS_INODE_BUF_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.c b/kernel/fs/xfs/libxfs/xfs_inode_fork.c new file mode 100644 index 000000000..0defbd02f --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.c @@ -0,0 +1,1902 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include /* NOTE(review): the header name is missing here, possibly lost when this patch text was extracted -- confirm against the original file */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_attr_sf.h" + +kmem_zone_t *xfs_ifork_zone; + +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the first nrecs extents of the given fork are valid: + * with fmt == XFS_EXTFMT_NOSTATE each one must decode to XFS_EXT_NORM. + */ +void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); /* work on a local copy fetched with unaligned-safe loads */ + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) /* expands to nothing without DEBUG */ +#endif /* DEBUG */ + + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode.
For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +int +xfs_iformat_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error = 0; + xfs_fsize_t di_size; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { /* data plus attr extent counts exceed the block count */ + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { /* NOTE(review): compares di_forkoff directly with the inode size -- confirm the unit of di_forkoff */ + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { /* realtime flag set but the mount has no m_rtdev_targp */ + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + switch (ip->i_d.di_mode & S_IFMT) { /* data fork setup depends on the file type */ + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch
(dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size < 0 || + di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + size = (int)di_size; /* range-checked just above, so the narrowing is safe */ + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return -EFSCORRUPTED; + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) /* everything below sets up the attribute fork and only runs when XFS_DFORK_Q(dip) is true */ + return 0; + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; /* NOTE(review): returns with ip->i_afp still allocated and the data fork intact, unlike the common error path at the end of this function -- confirm the caller cleans up */ + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip,
dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = -EFSCORRUPTED; + break; + } + if (error) { /* attr fork setup failed: free the attr fork and also tear down the data fork built above */ + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in if_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; /* stays 0 unless a separate buffer is allocated below */ + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return -EFSCORRUPTED; + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); /* NOTE(review): runs before the loop below fills in the in-core records -- confirm the ordering is intended */ + for (i = 0; i < nex; i++, dp++) { /* copy each on-disk record, converting both words from big-endian */ + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return -EFSCORRUPTED; + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has no more extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + mp, dip); + return -EFSCORRUPTED; + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; /* only the root is in core; xfs_iread_extents() sets XFS_IFEXTENTS later */ + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+	xfs_trans_t *tp,	/* passed through to xfs_bmap_read_extents() */
+	xfs_inode_t *ip,	/* inode to read; asserted locked XFS_ILOCK_EXCL */
+	int whichfork)		/* which fork's extents to read in */
+{
+	int error;
+	xfs_ifork_t *ifp;
+	xfs_extnum_t nextents;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
+		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
+			ip->i_mount);
+		return -EFSCORRUPTED;	/* only btree-format forks are accepted */
+	}
+	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+
+	/*
+	 * We know that the size is valid (it's checked in iformat_btree)
+	 */
+	ifp->if_bytes = ifp->if_real_bytes = 0;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	xfs_iext_add(ifp, 0, nextents);	/* make room for nextents records */
+	error = xfs_bmap_read_extents(tp, ip, whichfork);
+	if (error) {
+		xfs_iext_destroy(ifp);	/* release the space added above */
+		ifp->if_flags &= ~XFS_IFEXTENTS;
+		return error;
+	}
+	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+	return 0;
+}
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff. Move the records
+ * and pointers in if_broot to fit the new size. When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller. When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root. If the if_broot is currently NULL, then
+ * if we are adding records, one will be allocated. The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * rec_diff -- the change in the number of records, positive or negative,
+ *	 requested for the if_broot array.
+ * whichfork -- which fork of ip holds the if_broot being resized
+ */
+void
+xfs_iroot_realloc(
+	xfs_inode_t *ip,
+	int rec_diff,
+	int whichfork)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	int cur_max;		/* record capacity of the current root */
+	xfs_ifork_t *ifp;
+	struct xfs_btree_block *new_broot;
+	int new_max;		/* record capacity after the change */
+	size_t new_size;	/* byte size of the root after the change */
+	char *np;		/* destination address for a copy/move */
+	char *op;		/* source address for a copy/move */
+
+	/*
+	 * Handle the degenerate case quietly.
+	 */
+	if (rec_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (rec_diff > 0) {
+		/*
+		 * If there wasn't any memory allocated before, just
+		 * allocate it now and get out.
+		 */
+		if (ifp->if_broot_bytes == 0) {
+			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
+			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+			ifp->if_broot_bytes = (int)new_size;
+			return;
+		}
+
+		/*
+		 * If there is already an existing if_broot, then we need
+		 * to realloc() it and shift the pointers to their new
+		 * location. The records don't change location because
+		 * they are kept butted up against the btree block header.
+		 */
+		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+		new_max = cur_max + rec_diff;
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
+				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
+				KM_SLEEP | KM_NOFS);
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+				ifp->if_broot_bytes);	/* pointers, old size */
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+				(int)new_size);		/* pointers, new size */
+		ifp->if_broot_bytes = (int)new_size;
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
+		return;
+	}
+
+	/*
+	 * rec_diff is less than 0. In this case, we are shrinking the
+	 * if_broot buffer. It must already exist. If we go to zero
+	 * records, just get rid of the root and clear the status bit.
+	 */
+	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	if (new_max > 0)
+		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
+	else
+		new_size = 0;
+	if (new_size > 0) {
+		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
+		/*
+		 * First copy over the btree block header.
+		 */
+		memcpy(new_broot, ifp->if_broot,
+			XFS_BMBT_BLOCK_LEN(ip->i_mount));
+	} else {
+		new_broot = NULL;
+		ifp->if_flags &= ~XFS_IFBROOT;	/* no root left: clear the status bit */
+	}
+
+	/*
+	 * Only copy the records and pointers if there are any.
+	 */
+	if (new_max > 0) {
+		/*
+		 * First copy the records.
+		 */
+		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+		/*
+		 * Then copy the pointers.
+		 */
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+				ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
+				(int)new_size);
+		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
+	}
+	kmem_free(ifp->if_broot);	/* old root is always released when shrinking */
+	ifp->if_broot = new_broot;
+	ifp->if_broot_bytes = (int)new_size;
+	if (ifp->if_broot)
+		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+			XFS_IFORK_SIZE(ip, whichfork));
+	return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased. The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer. Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *	 requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+	xfs_inode_t *ip,
+	int byte_diff,
+	int whichfork)	/* which fork's if_data is resized */
+{
+	xfs_ifork_t *ifp;
+	int new_size;	/* value if_bytes will have after the change */
+	int real_size;	/* value if_real_bytes will have; 0 unless kmem-allocated */
+
+	if (byte_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {	/* nothing left: drop any kmem-allocated buffer */
+		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			kmem_free(ifp->if_u1.if_data);
+		}
+		ifp->if_u1.if_data = NULL;
+		real_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+		/*
+		 * If the valid extents/data can fit in if_inline_ext/data,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_data == NULL) {
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			ASSERT(ifp->if_real_bytes != 0);
+			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+				new_size);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		}
+		real_size = 0;
+	} else {
+		/*
+		 * Stuck with malloc/realloc.
+		 * For inline data, the underlying buffer must be
+		 * a multiple of 4 bytes in size so that it can be
+		 * logged and stay on word boundaries. We enforce
+		 * that here.
+		 */
+		real_size = roundup(new_size, 4);
+		if (ifp->if_u1.if_data == NULL) {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+				KM_SLEEP | KM_NOFS);
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			/*
+			 * Only do the realloc if the underlying size
+			 * is really changing.
+			 */
+			if (ifp->if_real_bytes != real_size) {
+				ifp->if_u1.if_data =
+					kmem_realloc(ifp->if_u1.if_data,
+						real_size,
+						ifp->if_real_bytes,
+						KM_SLEEP | KM_NOFS);
+			}
+		} else {	/* data currently lives in the inline buffer */
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size,
+				KM_SLEEP | KM_NOFS);
+			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+				ifp->if_bytes);
+		}
+	}
+	ifp->if_real_bytes = real_size;
+	ifp->if_bytes = new_size;
+	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+/*
+ * Free the in-core memory attached to one fork of an inode: the
+ * btree root (if_broot), a kmem-allocated local-format data buffer
+ * or the extent list (via xfs_iext_destroy()), and, when whichfork
+ * is XFS_ATTR_FORK, the fork structure itself, which is returned to
+ * xfs_ifork_zone with ip->i_afp set to NULL.
+ */
+void
+xfs_idestroy_fork(
+	xfs_inode_t *ip,
+	int whichfork)
+{
+	xfs_ifork_t *ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_broot != NULL) {
+		kmem_free(ifp->if_broot);
+		ifp->if_broot = NULL;
+	}
+
+	/*
+	 * If the format is local, then we can't have an extents
+	 * array so just look for an inline data array. If we're
+	 * not local then we may or may not have an extents list,
+	 * so check and free it up if we do.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+			(ifp->if_u1.if_data != NULL)) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_data);
+			ifp->if_u1.if_data = NULL;
+			ifp->if_real_bytes = 0;
+		}
+	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+		((ifp->if_flags & XFS_IFEXTIREC) ||
+		((ifp->if_u1.if_extents != NULL) &&
+		(ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
+		ASSERT(ifp->if_real_bytes != 0);
+		xfs_iext_destroy(ifp);
+	}
+	ASSERT(ifp->if_u1.if_extents == NULL ||
+		ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+	ASSERT(ifp->if_real_bytes == 0);
+	if (whichfork == XFS_ATTR_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+}
+
+/*
+ * Convert in-core extents to on-disk form
+ *
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
+ *
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine. We will return the size actually
+ * used.
+ */
+int
+xfs_iextents_copy(
+	xfs_inode_t *ip,	/* inode whose in-core extents are copied */
+	xfs_bmbt_rec_t *dp,	/* destination for the big-endian records */
+	int whichfork)		/* which fork's extents to copy */
+{
+	int copied;		/* number of records written to dp */
+	int i;
+	xfs_ifork_t *ifp;
+	int nrecs;		/* number of in-core records in the fork */
+	xfs_fsblock_t start_block;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(ifp->if_bytes > 0);
+
+	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
+	ASSERT(nrecs > 0);
+
+	/*
+	 * There may be delayed allocation extents in the
+	 * inode, so copy the extents one at a time and skip
+	 * the delayed ones. There must be at least one
+	 * non-delayed extent.
+	 */
+	copied = 0;
+	for (i = 0; i < nrecs; i++) {
+		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+		start_block = xfs_bmbt_get_startblock(ep);
+		if (isnullstartblock(start_block)) {
+			/*
+			 * It's a delayed allocation extent, so skip it.
+			 */
+			continue;
+		}
+
+		/* Translate to on disk format */
+		put_unaligned_be64(ep->l0, &dp->l0);
+		put_unaligned_be64(ep->l1, &dp->l1);
+		dp++;
+		copied++;
+	}
+	ASSERT(copied != 0);
+	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
+
+	return (copied * (uint)sizeof(xfs_bmbt_rec_t));	/* bytes written to dp */
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+void
+xfs_iflush_fork(
+	xfs_inode_t *ip,		/* in-core inode being flushed */
+	xfs_dinode_t *dip,		/* on-disk inode written into */
+	xfs_inode_log_item_t *iip,	/* log item; its ili_fields pick what to copy */
+	int whichfork)			/* which fork to flush */
+{
+	char *cp;	/* start of the fork's area inside dip */
+	xfs_ifork_t *ifp;
+	xfs_mount_t *mp;
+	static const short brootflag[2] =	/* indexed by whichfork */
+		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+	static const short dataflag[2] =	/* indexed by whichfork */
+		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+	static const short extflag[2] =		/* indexed by whichfork */
+		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+	if (!iip)	/* no log item, so no ili_fields to consult */
+		return;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * This can happen if we gave up in iformat in an error path,
+	 * for the attribute fork.
+	 */
+	if (!ifp) {
+		ASSERT(whichfork == XFS_ATTR_FORK);
+		return;
+	}
+	cp = XFS_DFORK_PTR(dip, whichfork);
+	mp = ip->i_mount;
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:	/* copy the local data bytes verbatim */
+		if ((iip->ili_fields & dataflag[whichfork]) &&
+			(ifp->if_bytes > 0)) {
+			ASSERT(ifp->if_u1.if_data != NULL);
+			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+		}
+		break;
+
+	case XFS_DINODE_FMT_EXTENTS:	/* convert extents via xfs_iextents_copy() */
+		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+			!(iip->ili_fields & extflag[whichfork]));
+		if ((iip->ili_fields & extflag[whichfork]) &&
+			(ifp->if_bytes > 0)) {
+			ASSERT(xfs_iext_get_ext(ifp, 0));
+			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
+				whichfork);
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:	/* convert the in-core root via xfs_bmbt_to_bmdr() */
+		if ((iip->ili_fields & brootflag[whichfork]) &&
+			(ifp->if_broot_bytes > 0)) {
+			ASSERT(ifp->if_broot != NULL);
+			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
+				XFS_IFORK_SIZE(ip, whichfork));
+			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
+				(xfs_bmdr_block_t *)cp,
+				XFS_DFORK_SIZE(dip, mp, whichfork));
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:	/* data fork only (asserted) */
+		if (iip->ili_fields & XFS_ILOG_DEV) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:	/* data fork only (asserted) */
+		if (iip->ili_fields & XFS_ILOG_UUID) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			memcpy(XFS_DFORK_DPTR(dip),
+				&ip->i_df.if_u2.if_uuid,
+				sizeof(uuid_t));
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * Return a pointer to the extent record at file index idx.
+ *
+ * When the fork uses the indirection array (XFS_IFEXTIREC set) the
+ * record is looked up in the owning extent page, found with
+ * xfs_iext_idx_to_irec(); index 0 is short-cut to the first page's
+ * buffer. Otherwise the record is taken from the flat
+ * if_u1.if_extents array. Returns NULL when the fork holds no
+ * extent bytes (if_bytes == 0).
+ */
+xfs_bmbt_rec_host_t *
+xfs_iext_get_ext(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t idx)	/* index of target extent */
+{
+	ASSERT(idx >= 0);
+	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
+	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
+		return ifp->if_u1.if_ext_irec->er_extbuf;
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {
+		xfs_ext_irec_t *erp;		/* irec pointer */
+		int erp_idx = 0;		/* irec index */
+		xfs_extnum_t page_idx = idx;	/* ext index in target list */
+
+		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+		return &erp->er_extbuf[page_idx];
+	} else if (ifp->if_bytes) {
+		return &ifp->if_u1.if_extents[idx];
+	} else {
+		return NULL;
+	}
+}
+
+/*
+ * Insert new item(s) into the extent records for incore inode
+ * fork 'ifp'. 'count' new items are inserted at index 'idx'.
+ *
+ * The fork is chosen from 'state': the attribute fork when
+ * BMAP_ATTRFORK is set, the data fork otherwise. Space is opened up
+ * with xfs_iext_add() and each new slot is then filled from 'new'
+ * with xfs_bmbt_set_all().
+ */
+void
+xfs_iext_insert(
+	xfs_inode_t *ip,	/* incore inode pointer */
+	xfs_extnum_t idx,	/* starting index of new items */
+	xfs_extnum_t count,	/* number of inserted items */
+	xfs_bmbt_irec_t *new,	/* items to insert */
+	int state)		/* type of extent conversion */
+{
+	xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t i;		/* extent record index */
+
+	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
+
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	xfs_iext_add(ifp, idx, count);
+	for (i = idx; i < idx + count; i++, new++)
+		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be increased.
The ext_diff parameter stores the
+ * number of new extents being added and the idx parameter contains
+ * the extent index where the new extents will be added. If the new
+ * extents are being appended, then we just need to (re)allocate and
+ * initialize the space. Otherwise, if the new extents are being
+ * inserted into the middle of the existing entries, a bit more work
+ * is required to make room for the new extents to be inserted. The
+ * caller is responsible for filling in the new extent entries upon
+ * return.
+ */
+void
+xfs_iext_add(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t idx,	/* index to begin adding exts */
+	int ext_diff)		/* number of extents to add */
+{
+	int byte_diff;		/* new bytes being added */
+	int new_size;		/* size of extents after adding */
+	xfs_extnum_t nextents;	/* number of extents in file */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT((idx >= 0) && (idx <= nextents));
+	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
+	new_size = ifp->if_bytes + byte_diff;
+	/*
+	 * If the new number of extents (nextents + ext_diff)
+	 * fits inside the inode, then continue to use the inline
+	 * extent buffer.
+	 */
+	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
+		if (idx < nextents) {	/* inserting: shift the tail up, zero the gap */
+			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
+				&ifp->if_u2.if_inline_ext[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
+		}
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+		ifp->if_real_bytes = 0;
+	}
+	/*
+	 * Otherwise use a linear (direct) extent list.
+	 * If the extents are currently inside the inode,
+	 * xfs_iext_realloc_direct will switch us from
+	 * inline to direct extent allocation mode.
+	 */
+	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, new_size);
+		if (idx < nextents) {	/* inserting: shift the tail up, zero the gap */
+			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
+				&ifp->if_u1.if_extents[idx],
+				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
+			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
+		}
+	}
+	/* Indirection array */
+	else {
+		xfs_ext_irec_t *erp;
+		int erp_idx = 0;
+		int page_idx = idx;
+
+		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
+		if (ifp->if_flags & XFS_IFEXTIREC) {
+			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
+		} else {
+			xfs_iext_irec_init(ifp);
+			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+			erp = ifp->if_u1.if_ext_irec;
+		}
+		/* Extents fit in target extent page */
+		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
+			if (page_idx < erp->er_extcount) {
+				memmove(&erp->er_extbuf[page_idx + ext_diff],
+					&erp->er_extbuf[page_idx],
+					(erp->er_extcount - page_idx) *
+					sizeof(xfs_bmbt_rec_t));
+				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
+			}
+			erp->er_extcount += ext_diff;
+			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		}
+		/*
+		 * Extents do not fit in the target extent page:
+		 * insert new extent page(s).
+		 */
+		else if (erp) {
+			xfs_iext_add_indirect_multi(ifp,
+				erp_idx, page_idx, ext_diff);
+		}
+		/*
+		 * If extent(s) are being appended to the last page in
+		 * the indirection array and the new extent(s) don't fit
+		 * in the page, then erp is NULL and erp_idx is set to
+		 * the next index needed in the indirection array.
+		 */
+		else {
+			uint count = ext_diff;	/* extents still to place */
+
+			while (count) {
+				erp = xfs_iext_irec_new(ifp, erp_idx);
+				erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+				count -= erp->er_extcount;
+				if (count)
+					erp_idx++;
+			}
+		}
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being added to the indirection
+ * array and the new extents do not fit in the target extent list.
The
+ * erp_idx parameter contains the irec index for the target extent list
+ * in the indirection array, and the idx parameter contains the extent
+ * index within the list. The number of extents being added is stored
+ * in the count parameter.
+ *
+ *    |-------|   |-------|
+ *    |       |   |       |    idx - number of extents before idx
+ *    |  idx  |   | count |
+ *    |       |   |       |    count - number of extents being inserted at idx
+ *    |-------|   |-------|
+ *    | count |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_add_indirect_multi(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	int erp_idx,		/* target extent irec index */
+	xfs_extnum_t idx,	/* index within target list */
+	int count)		/* new extents being added */
+{
+	int byte_diff;		/* new bytes being added */
+	xfs_ext_irec_t *erp;	/* pointer to irec entry */
+	xfs_extnum_t ext_diff;	/* number of extents to add */
+	xfs_extnum_t ext_cnt;	/* new extents still needed */
+	xfs_extnum_t nex2;	/* extents after idx + count */
+	xfs_bmbt_rec_t *nex2_ep = NULL;	/* temp list for nex2 extents */
+	int nlists;		/* number of irec's (lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	nex2 = erp->er_extcount - idx;
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/*
+	 * Save second part of target extent list
+	 * (all extents from index idx onward) in a temporary
+	 * buffer, and take them out of the target list.
+	 */
+	if (nex2) {
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
+		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
+		erp->er_extcount -= nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
+		memset(&erp->er_extbuf[idx], 0, byte_diff);
+	}
+
+	/*
+	 * Add the new extents to the end of the target
+	 * list, then allocate new irec record(s) and
+	 * extent buffer(s) as needed to store the rest
+	 * of the new extents.
+	 */
+	ext_cnt = count;
+	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
+	if (ext_diff) {
+		erp->er_extcount += ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+	while (ext_cnt) {
+		erp_idx++;
+		erp = xfs_iext_irec_new(ifp, erp_idx);
+		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
+		erp->er_extcount = ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
+		ext_cnt -= ext_diff;
+	}
+
+	/* Add nex2 extents back to indirection array */
+	if (nex2) {
+		xfs_extnum_t ext_avail;	/* free slots in the page considered */
+		int i;			/* slot in erp where nex2_ep is copied */
+
+		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
+		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
+		i = 0;
+		/*
+		 * If nex2 extents fit in the current page, append
+		 * nex2_ep after the new extents.
+		 */
+		if (nex2 <= ext_avail) {
+			i = erp->er_extcount;
+		}
+		/*
+		 * Otherwise, check if space is available in the
+		 * next page.
+		 */
+		else if ((erp_idx < nlists - 1) &&
+			(nex2 <= (ext_avail = XFS_LINEAR_EXTS -
+			ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
+			erp_idx++;
+			erp++;
+			/* Create a hole for nex2 extents */
+			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
+				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
+		}
+		/*
+		 * Final choice, create a new extent page for
+		 * nex2 extents.
+		 */
+		else {
+			erp_idx++;
+			erp = xfs_iext_irec_new(ifp, erp_idx);
+		}
+		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
+		kmem_free(nex2_ep);	/* temporary copy no longer needed */
+		erp->er_extcount += nex2;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
+	}
+}
+
+/*
+ * This is called when the amount of space required for incore file
+ * extents needs to be decreased. The ext_diff parameter stores the
+ * number of extents to be removed and the idx parameter contains
+ * the extent index where the extents will be removed from.
+ *
+ * If the amount of space needed has decreased below the linear
+ * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
+ * extent array. Otherwise, use kmem_realloc() to adjust the
+ * size to what is needed.
+ */
+void
+xfs_iext_remove(
+	xfs_inode_t *ip,	/* incore inode pointer */
+	xfs_extnum_t idx,	/* index to begin removing exts */
+	int ext_diff,		/* number of extents to remove */
+	int state)		/* type of extent conversion; BMAP_ATTRFORK picks the attr fork */
+{
+	xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_extnum_t nextents;	/* number of extents in file */
+	int new_size;		/* size of extents after removal */
+
+	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
+
+	ASSERT(ext_diff > 0);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {	/* every extent is going away */
+		xfs_iext_destroy(ifp);
+	} else if (ifp->if_flags & XFS_IFEXTIREC) {	/* indirection array */
+		xfs_iext_remove_indirect(ifp, idx, ext_diff);
+	} else if (ifp->if_real_bytes) {	/* kmem-allocated direct list */
+		xfs_iext_remove_direct(ifp, idx, ext_diff);
+	} else {	/* inline buffer */
+		xfs_iext_remove_inline(ifp, idx, ext_diff);
+	}
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This removes ext_diff extents from the inline buffer, beginning
+ * at extent index idx.
+ *
+ * Records after the removed range are moved down over it and the
+ * vacated slots at the end are zeroed. if_bytes is not updated
+ * here; xfs_iext_remove() sets it after this returns.
+ */
+void
+xfs_iext_remove_inline(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t idx,	/* index to begin removing exts */
+	int ext_diff)		/* number of extents to remove */
+{
+	int nextents;		/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	ASSERT(idx < XFS_INLINE_EXTS);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(((nextents - ext_diff) > 0) &&
+		(nextents - ext_diff) < XFS_INLINE_EXTS);
+
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u2.if_inline_ext[idx],
+			&ifp->if_u2.if_inline_ext[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			sizeof(xfs_bmbt_rec_t));
+		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
+			0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	} else {	/* removing from the tail: just zero the slots */
+		memset(&ifp->if_u2.if_inline_ext[idx], 0,
+			ext_diff * sizeof(xfs_bmbt_rec_t));
+	}
+}
+
+/*
+ * This removes ext_diff extents from a linear (direct) extent list,
+ * beginning at extent index idx.
If the extents are being removed
+ * from the end of the list (ie. truncate) then we just need to re-
+ * allocate the list to remove the extra space. Otherwise, if the
+ * extents are being removed from the middle of the existing extent
+ * entries, then we first need to move the extent records beginning
+ * at idx + ext_diff up in the list to overwrite the records being
+ * removed, then remove the extra space via kmem_realloc.
+ */
+void
+xfs_iext_remove_direct(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t idx,	/* index to begin removing exts */
+	int ext_diff)		/* number of extents to remove */
+{
+	xfs_extnum_t nextents;	/* number of extents in file */
+	int new_size;		/* size of extents after removal */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	new_size = ifp->if_bytes -
+		(ext_diff * sizeof(xfs_bmbt_rec_t));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (new_size == 0) {	/* nothing left: free the whole list */
+		xfs_iext_destroy(ifp);
+		return;
+	}
+	/* Move extents up in the list (if needed) */
+	if (idx + ext_diff < nextents) {
+		memmove(&ifp->if_u1.if_extents[idx],
+			&ifp->if_u1.if_extents[idx + ext_diff],
+			(nextents - (idx + ext_diff)) *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	/* Zero the ext_diff slots vacated at the end of the list */
+	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
+		0, ext_diff * sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Reallocate the direct extent list. If the extents
+	 * will fit inside the inode then xfs_iext_realloc_direct
+	 * will switch from direct to inline extent allocation
+	 * mode for us.
+	 */
+	xfs_iext_realloc_direct(ifp, new_size);
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * This is called when incore extents are being removed from the
+ * indirection array and the extents being removed span multiple extent
+ * buffers. The idx parameter contains the file extent index where we
+ * want to begin removing extents, and the count parameter contains
+ * how many extents need to be removed.
+ *
+ *    |-------|   |-------|
+ *    | nex1  |   |       |    nex1 - number of extents before idx
+ *    |-------|   | count |
+ *    |       |   |       |    count - number of extents being removed at idx
+ *    | count |   |-------|
+ *    |       |   | nex2  |    nex2 - number of extents after idx + count
+ *    |-------|   |-------|
+ */
+void
+xfs_iext_remove_indirect(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t idx,	/* index to begin removing extents */
+	int count)		/* number of extents to remove */
+{
+	xfs_ext_irec_t *erp;	/* indirection array pointer */
+	int erp_idx = 0;	/* indirection array index */
+	xfs_extnum_t ext_cnt;	/* extents left to remove */
+	xfs_extnum_t ext_diff;	/* extents to remove in current list */
+	xfs_extnum_t nex1;	/* number of extents before idx */
+	xfs_extnum_t nex2;	/* extents after idx + count */
+	int page_idx = idx;	/* index in target extent list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
+	ASSERT(erp != NULL);
+	nex1 = page_idx;
+	ext_cnt = count;
+	while (ext_cnt) {
+		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
+		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
+		/*
+		 * Check for deletion of entire list;
+		 * xfs_iext_irec_remove() updates extent offsets.
+		 * erp_idx is not advanced in this case: the next
+		 * list to process is looked up again at the same
+		 * index.
+		 */
+		if (ext_diff == erp->er_extcount) {
+			xfs_iext_irec_remove(ifp, erp_idx);
+			ext_cnt -= ext_diff;
+			nex1 = 0;
+			if (ext_cnt) {
+				ASSERT(erp_idx < ifp->if_real_bytes /
+					XFS_IEXT_BUFSZ);
+				erp = &ifp->if_u1.if_ext_irec[erp_idx];
+				nex1 = 0;	/* already cleared above */
+				continue;
+			} else {
+				break;
+			}
+		}
+		/* Move extents up (if needed) */
+		if (nex2) {
+			memmove(&erp->er_extbuf[nex1],
+				&erp->er_extbuf[nex1 + ext_diff],
+				nex2 * sizeof(xfs_bmbt_rec_t));
+		}
+		/* Zero out rest of page */
+		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
+			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
+		/* Update remaining counters */
+		erp->er_extcount -= ext_diff;
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
+		ext_cnt -= ext_diff;
+		nex1 = 0;	/* later lists are trimmed from their start */
+		erp_idx++;
+		erp++;
+	}
+	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
+	xfs_iext_irec_compact(ifp);
+}
+
+/*
+ * Create, destroy, or resize a linear (direct) block of extents.
+ *
+ * new_size is the wanted extent-list size in bytes. Zero frees the
+ * list. A size that fits in XFS_INLINE_EXTS records switches an
+ * allocated list back to the inline buffer. Otherwise the list is
+ * allocated or reallocated with its size rounded up to a power of
+ * two. On return if_bytes is new_size and if_real_bytes is the
+ * allocated size, except in the new_size == 0 case, where both
+ * are set to 0 by xfs_iext_destroy() and then overwritten with
+ * rnew_size/new_size (also 0) here.
+ */
+void
+xfs_iext_realloc_direct(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	int new_size)		/* new size of extents, in bytes */
+{
+	int rnew_size;		/* real new size of extents */
+
+	rnew_size = new_size;
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
+		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
+		(new_size != ifp->if_real_bytes)));
+
+	/* Free extent records */
+	if (new_size == 0) {
+		xfs_iext_destroy(ifp);
+	}
+	/* Resize direct extent list and zero any new bytes */
+	else if (ifp->if_real_bytes) {
+		/* Check if extents will fit inside the inode */
+		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
+			xfs_iext_direct_to_inline(ifp, new_size /
+				(uint)sizeof(xfs_bmbt_rec_t));
+			ifp->if_bytes = new_size;
+			return;
+		}
+		if (!is_power_of_2(new_size)){
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		if (rnew_size != ifp->if_real_bytes) {
+			ifp->if_u1.if_extents =
+				kmem_realloc(ifp->if_u1.if_extents,
+					rnew_size,
+					ifp->if_real_bytes, KM_NOFS);
+		}
+		if (rnew_size > ifp->if_real_bytes) {
+			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t)], 0,
+				rnew_size - ifp->if_real_bytes);
+		}
+	}
+	/* Switch from the inline extent buffer to a direct extent list */
+	else {
+		if (!is_power_of_2(new_size)) {
+			rnew_size = roundup_pow_of_two(new_size);
+		}
+		xfs_iext_inline_to_direct(ifp, rnew_size);
+	}
+	ifp->if_real_bytes = rnew_size;
+	ifp->if_bytes = new_size;
+}
+
+/*
+ * Switch from linear (direct) extent records to inline buffer.
+ *
+ * The first nextents records are copied into if_inline_ext, the
+ * allocated list is freed and if_real_bytes is reset to 0.
+ * if_bytes is left for the caller to update.
+ */
+void
+xfs_iext_direct_to_inline(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	xfs_extnum_t nextents)	/* number of extents in file */
+{
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(nextents <= XFS_INLINE_EXTS);
+	/*
+	 * The inline buffer was zeroed when we switched
+	 * from inline to direct extent allocation mode,
+	 * so we don't need to clear it here.
+	 */
+	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+		nextents * sizeof(xfs_bmbt_rec_t));
+	kmem_free(ifp->if_u1.if_extents);
+	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	ifp->if_real_bytes = 0;
+}
+
+/*
+ * Switch from inline buffer to linear (direct) extent records.
+ * new_size should already be rounded up to the next power of 2
+ * by the caller (when appropriate), so use new_size as it is.
+ * However, since new_size may be rounded up, we can't update
+ * if_bytes here. It is the caller's responsibility to update
+ * if_bytes upon return.
+ */
+void
+xfs_iext_inline_to_direct(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	int new_size)		/* size in bytes of the list to allocate */
+{
+	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
+	memset(ifp->if_u1.if_extents, 0, new_size);
+	if (ifp->if_bytes) {
+		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
+			ifp->if_bytes);
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));	/* leave the inline buffer zeroed */
+	}
+	ifp->if_real_bytes = new_size;
+}
+
+/*
+ * Resize an extent indirection array to new_size bytes.
+ */
+STATIC void
+xfs_iext_realloc_indirect(
+	xfs_ifork_t *ifp,	/* inode fork pointer */
+	int new_size)		/* new indirection array size */
+{
+	int nlists;		/* number of irec's (ex lists) */
+	int size;		/* current indirection array size */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	size = nlists * sizeof(xfs_ext_irec_t);
+	ASSERT(ifp->if_real_bytes);
+	ASSERT((new_size >= 0) && (new_size != size));
+	if (new_size == 0) {	/* no irecs wanted: free everything */
+		xfs_iext_destroy(ifp);
+	} else {
+		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
+			kmem_realloc(ifp->if_u1.if_ext_irec,
+				new_size, size, KM_NOFS);
+	}
+}
+
+/*
+ * Switch from indirection array to linear (direct) extent allocations.
+ *
+ * After xfs_iext_irec_compact_pages() exactly one extent buffer is
+ * expected to remain (asserted via if_real_bytes == XFS_IEXT_BUFSZ).
+ * That buffer is kept and becomes if_u1.if_extents; only the
+ * indirection array itself is freed. If the buffer is not full it
+ * is then resized with xfs_iext_realloc_direct().
+ */
+STATIC void
+xfs_iext_indirect_to_direct(
+	 xfs_ifork_t *ifp)		/* inode fork pointer */
+{
+	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
+	xfs_extnum_t nextents;		/* number of extents in file */
+	int size;			/* size of file extents */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+	size = nextents * sizeof(xfs_bmbt_rec_t);
+
+	xfs_iext_irec_compact_pages(ifp);
+	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
+
+	ep = ifp->if_u1.if_ext_irec->er_extbuf;	/* keep the extent buffer */
+	kmem_free(ifp->if_u1.if_ext_irec);	/* free only the irec array */
+	ifp->if_flags &= ~XFS_IFEXTIREC;
+	ifp->if_u1.if_extents = ep;
+	ifp->if_bytes = size;
+	if (nextents < XFS_LINEAR_EXTS) {
+		xfs_iext_realloc_direct(ifp, size);
+	}
+}
+
+/*
+ * Free incore file extents.
+ */
+void
+xfs_iext_destroy(
+	xfs_ifork_t *ifp)	/* inode fork pointer */
+{
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		int erp_idx;
+		int nlists;
+
+		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
+			xfs_iext_irec_remove(ifp, erp_idx);	/* last to first */
+		}
+		ifp->if_flags &= ~XFS_IFEXTIREC;
+	} else if (ifp->if_real_bytes) {	/* kmem-allocated direct list */
+		kmem_free(ifp->if_u1.if_extents);
+	} else if (ifp->if_bytes) {	/* inline buffer in use: zero it */
+		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
+			sizeof(xfs_bmbt_rec_t));
+	}
+	ifp->if_u1.if_extents = NULL;
+	ifp->if_real_bytes = 0;
+	ifp->if_bytes = 0;
+}
+
+/*
+ * Return a pointer to the extent record for file system block bno.
+ *
+ * A binary search is done over the extent records (within the
+ * extent list picked by xfs_iext_bno_to_irec() when the indirection
+ * array is in use). If an extent contains bno, that extent is
+ * returned and *idxp is its file-based index. If no extent contains
+ * bno, the search ends on a neighbouring record: when that record
+ * lies before bno the following record is returned instead, or NULL
+ * with *idxp equal to the number of extents when there is none.
+ * With no extents at all, NULL is returned and *idxp is 0.
+ */
+xfs_bmbt_rec_host_t *			/* pointer to found extent record */
+xfs_iext_bno_to_ext(
+	xfs_ifork_t *ifp,		/* inode fork pointer */
+	xfs_fileoff_t bno,		/* block number to search for */
+	xfs_extnum_t *idxp)		/* index of target extent */
+{
+	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
+	xfs_filblks_t blockcount = 0;	/* number of blocks in extent */
+	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
+	xfs_ext_irec_t *erp = NULL;	/* indirection array pointer */
+	int high;			/* upper boundary in search */
+	xfs_extnum_t idx = 0;		/* index of target extent */
+	int low;			/* lower boundary in search */
+	xfs_extnum_t nextents;		/* number of file extents */
+	xfs_fileoff_t startoff = 0;	/* start offset of extent */
+
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*idxp = 0;
+		return NULL;
+	}
+	low = 0;
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		/* Find target extent list */
+		int erp_idx = 0;
+		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
+		base = erp->er_extbuf;
+		high = erp->er_extcount - 1;
+	} else {
+		base = ifp->if_u1.if_extents;
+		high = nextents - 1;
+	}
+	/* Binary search extent records */
+	while (low <= high) {
+		idx = (low + high) >> 1;
+		ep = base + idx;
+		startoff = xfs_bmbt_get_startoff(ep);
+		blockcount = xfs_bmbt_get_blockcount(ep);
+		if (bno < startoff) {
+			high = idx - 1;
+		} else if (bno >= startoff + blockcount) {
+			low = idx + 1;
+		} else {
+			/* Convert back to file-based extent index */
+			if (ifp->if_flags & XFS_IFEXTIREC) {
+				idx += erp->er_extoff;
+			}
+			*idxp = idx;
+			return ep;
+		}
+	}
+	/* Convert back to file-based extent index */
+	if (ifp->if_flags & XFS_IFEXTIREC) {
+		idx += erp->er_extoff;
+	}
+	if (bno >= startoff + blockcount) {	/* last probe lies before bno */
+		if (++idx == nextents) {
+			ep = NULL;
+		} else {
+			ep = xfs_iext_get_ext(ifp, idx);
+		}
+	}
+	*idxp = idx;
+	return ep;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record for filesystem block bno. Store the index of the
+ * target irec in *erp_idxp.
+ *
+ * The entries are binary searched on the start offset of the first
+ * extent in each entry's buffer: the search stops at the entry
+ * whose first extent starts at or before bno while the next entry
+ * (if any) starts after bno.
+ */
+xfs_ext_irec_t *			/* pointer to found irec entry */
+xfs_iext_bno_to_irec(
+	xfs_ifork_t *ifp,		/* inode fork pointer */
+	xfs_fileoff_t bno,		/* block number to search for */
+	int *erp_idxp)			/* irec index of target ext list */
+{
+	xfs_ext_irec_t *erp = NULL;	/* indirection array pointer */
+	xfs_ext_irec_t *erp_next;	/* next indirection array entry */
+	int erp_idx;			/* indirection array index */
+	int nlists;			/* number of extent irec's (lists) */
+	int high;			/* binary search upper limit */
+	int low;			/* binary search lower limit */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
+		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
+			high = erp_idx - 1;
+		} else if (erp_next && bno >=
+			xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
+			low = erp_idx + 1;
+		} else {
+			break;
+		}
+	}
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Return a pointer to the indirection array entry containing the
+ * extent record at file extent index *idxp.
Store the index of the
+ * target irec in *erp_idxp and store the page index of the target
+ * extent record in *idxp.
+ */
+xfs_ext_irec_t *
+xfs_iext_idx_to_irec(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
+	int		*erp_idxp,	/* out: index of target irec */
+	int		realloc)	/* new bytes were just added */
+{
+	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
+	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
+	int		erp_idx;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+	int		high;		/* binary search upper limit */
+	int		low;		/* binary search lower limit */
+	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	ASSERT(page_idx >= 0);
+	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp_idx = 0;
+	low = 0;
+	high = nlists - 1;
+
+	/* Binary search extent irec's */
+	while (low <= high) {
+		erp_idx = (low + high) >> 1;
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		prev = erp_idx > 0 ? erp - 1 : NULL;	/* NULL on the first irec */
+		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
+		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
+			high = erp_idx - 1;	/* earlier page, or the tail of a non-full prev */
+		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
+			   (page_idx == erp->er_extoff + erp->er_extcount &&
+			    !realloc)) {
+			low = erp_idx + 1;
+		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
+			   erp->er_extcount == XFS_LINEAR_EXTS) {
+			ASSERT(realloc);
+			page_idx = 0;		/* slot 0 of the page after this full one */
+			erp_idx++;
+			erp = erp_idx < nlists ? erp + 1 : NULL; /* NULL when no page follows */
+			break;
+		} else {
+			page_idx -= erp->er_extoff;	/* file index -> index within page */
+			break;
+		}
+	}
+	*idxp = page_idx;
+	*erp_idxp = erp_idx;
+	return erp;
+}
+
+/*
+ * Allocate and initialize an indirection array once the space needed
+ * for incore extents increases above XFS_IEXT_BUFSZ.
+ */
+void
+xfs_iext_irec_init(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+
+	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(nextents <= XFS_LINEAR_EXTS);
+
+	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
+
+	if (nextents == 0) {			/* nothing to carry over: fresh page */
+		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	} else if (!ifp->if_real_bytes) {	/* no separately allocated buffer yet */
+		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
+	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { /* buffer smaller than a page */
+		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
+	}
+	erp->er_extbuf = ifp->if_u1.if_extents;	/* existing buffer becomes the first page */
+	erp->er_extcount = nextents;
+	erp->er_extoff = 0;
+
+	ifp->if_flags |= XFS_IFEXTIREC;
+	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
+	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
+	ifp->if_u1.if_ext_irec = erp;		/* if_u1 now refers to the irec array */
+
+	return;
+}
+
+/*
+ * Allocate and initialize a new entry in the indirection array.
+ */
+xfs_ext_irec_t *
+xfs_iext_irec_new(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* index for new irec */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+
+	/* Resize indirection array */
+	xfs_iext_realloc_indirect(ifp, ++nlists *
+				  sizeof(xfs_ext_irec_t));
+	/*
+	 * Move records down in the array so the
+	 * new page can use erp_idx.
+	 */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = nlists - 1; i > erp_idx; i--) {
+		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
+	}
+	ASSERT(i == erp_idx);
+
+	/* Initialize new extent record */
+	erp = ifp->if_u1.if_ext_irec;
+	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; /* updated only now: the resize above derived the old size from it */
+	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
+	erp[erp_idx].er_extcount = 0;
+	erp[erp_idx].er_extoff = erp_idx > 0 ?	/* starts where the previous page ends */
+		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
+	return (&erp[erp_idx]);
+}
+
+/*
+ * Remove a record from the indirection array.
+ */
+void
+xfs_iext_irec_remove(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx)	/* irec index to remove */
+{
+	xfs_ext_irec_t	*erp;		/* indirection array pointer */
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	erp = &ifp->if_u1.if_ext_irec[erp_idx];
+	if (erp->er_extbuf) {	/* NULL: page already freed (see xfs_iext_irec_compact_pages) */
+		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
+			-erp->er_extcount);	/* later pages start that much earlier */
+		kmem_free(erp->er_extbuf);
+	}
+	/* Compact extent records */
+	erp = ifp->if_u1.if_ext_irec;
+	for (i = erp_idx; i < nlists - 1; i++) {
+		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
+	}
+	/*
+	 * Manually free the last extent record from the indirection
+	 * array. A call to xfs_iext_realloc_indirect() with a size
+	 * of zero would result in a call to xfs_iext_destroy() which
+	 * would in turn call this function again, creating a nasty
+	 * infinite loop.
+	 */
+	if (--nlists) {
+		xfs_iext_realloc_indirect(ifp,
+			nlists * sizeof(xfs_ext_irec_t));
+	} else {
+		kmem_free(ifp->if_u1.if_ext_irec);
+	}
+	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
+}
+
+/*
+ * This is called to clean up large amounts of unused memory allocated
+ * by the indirection array.
Before compacting anything though, verify
+ * that the indirection array is still needed and switch back to the
+ * linear extent list (or even the inline buffer) if possible. The
+ * compaction policy is as follows:
+ *
+ * Full Compaction: Extents fit into a single page (or inline buffer)
+ * Partial Compaction: Extents occupy less than 50% of allocated space
+ * No Compaction: Extents occupy at least 50% of allocated space
+ */
+void
+xfs_iext_irec_compact(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+
+	if (nextents == 0) {
+		xfs_iext_destroy(ifp);
+	} else if (nextents <= XFS_INLINE_EXTS) {	/* full: fits the inline buffer */
+		xfs_iext_indirect_to_direct(ifp);
+		xfs_iext_direct_to_inline(ifp, nextents);
+	} else if (nextents <= XFS_LINEAR_EXTS) {	/* full: fits a single page */
+		xfs_iext_indirect_to_direct(ifp);
+	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { /* partial: under 50% used */
+		xfs_iext_irec_compact_pages(ifp);
+	}
+}
+
+/*
+ * Combine extents from neighboring extent pages.
+ */
+void
+xfs_iext_irec_compact_pages(
+	xfs_ifork_t	*ifp)		/* inode fork pointer */
+{
+	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
+	int		erp_idx = 0;	/* indirection array index */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	while (erp_idx < nlists - 1) {
+		erp = &ifp->if_u1.if_ext_irec[erp_idx];
+		erp_next = erp + 1;
+		if (erp_next->er_extcount <=
+		    (XFS_LINEAR_EXTS - erp->er_extcount)) { /* next page fits in our free slots */
+			memcpy(&erp->er_extbuf[erp->er_extcount],
+				erp_next->er_extbuf, erp_next->er_extcount *
+				sizeof(xfs_bmbt_rec_t));
+			erp->er_extcount += erp_next->er_extcount;
+			/*
+			 * Free page before removing extent record
+			 * so er_extoffs don't get modified in
+			 * xfs_iext_irec_remove.
+			 */
+			kmem_free(erp_next->er_extbuf);
+			erp_next->er_extbuf = NULL;
+			xfs_iext_irec_remove(ifp, erp_idx + 1);
+			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; /* re-read: one irec was removed */
+		} else {
+			erp_idx++;	/* no room here: try from the next page */
+		}
+	}
+}
+
+/*
+ * This is called to update the er_extoff field in the indirection
+ * array when extents have been added or removed from one of the
+ * extent lists. erp_idx contains the irec index to begin updating
+ * at and ext_diff contains the number of extents that were added
+ * or removed.
+ */
+void
+xfs_iext_irec_update_extoffs(
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	int		erp_idx,	/* first irec index to update */
+	int		ext_diff)	/* extents added (> 0) or removed (< 0) */
+{
+	int		i;		/* loop counter */
+	int		nlists;		/* number of irec's (ex lists) */
+
+	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+	for (i = erp_idx; i < nlists; i++) {
+		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
+	}
+}
diff --git a/kernel/fs/xfs/libxfs/xfs_inode_fork.h b/kernel/fs/xfs/libxfs/xfs_inode_fork.h
new file mode 100644
index 000000000..7d3b1ed6d
--- /dev/null
+++ b/kernel/fs/xfs/libxfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_FORK_H__ +#define __XFS_INODE_FORK_H__ + +struct xfs_inode_log_item; +struct xfs_dinode; + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. + * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. 
+ */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? 
\ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + +int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, + int); + +struct xfs_bmbt_rec_host * + xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, + struct xfs_bmbt_irec *, int); +void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, + xfs_extnum_t, int); +void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(struct xfs_ifork *, int); +void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +void xfs_iext_destroy(struct xfs_ifork *); +struct xfs_bmbt_rec_host * + xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int 
*); +struct xfs_ext_irec * + xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, + int); +void xfs_iext_irec_init(struct xfs_ifork *); +struct xfs_ext_irec * + xfs_iext_irec_new(struct xfs_ifork *, int); +void xfs_iext_irec_remove(struct xfs_ifork *, int); +void xfs_iext_irec_compact(struct xfs_ifork *); +void xfs_iext_irec_compact_pages(struct xfs_ifork *); +void xfs_iext_irec_compact_full(struct xfs_ifork *); +void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); + +extern struct kmem_zone *xfs_ifork_zone; + +#endif /* __XFS_INODE_FORK_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_format.h b/kernel/fs/xfs/libxfs/xfs_log_format.h new file mode 100644 index 000000000..265314690 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_format.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_FORMAT_H__ +#define __XFS_LOG_FORMAT_H__ + +struct xfs_mount; +struct xfs_trans_res; + +/* + * On-disk Log Format definitions. + * + * This file contains all the on-disk format definitions used within the log. It + * includes the physical log structure itself, as well as all the log item + * format structures that are written into the log and interpreted by log + * recovery.
We start with the physical log format definitions, and then work + * through all the log items definitions and everything they encode into the + * log. + */ +typedef __uint32_t xlog_tid_t; + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +/* Minimum number of transactions that must fit in the log (defined by mkfs) */ +#define XFS_MIN_LOG_FACTOR 3 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
\ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +/* Log Clients */ +#define XFS_TRANSACTION 0x69 +#define XFS_VOLUME 0x2 +#define XFS_LOG 0xaa + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. 
+ * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __le32 h_crc; /* crc of log record : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. 
+ */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* not an on-disk structure, but needed by log recovery in userspace */ +typedef struct xfs_log_iovec { + void *i_addr; /* beginning address of region */ + int i_len; /* length in bytes of region */ + uint i_type; /* type of region */ +} xfs_log_iovec_t; + + +/* + * Transaction Header definitions. + * + * This is the structure written in the log at the head of every transaction. It + * identifies the type and id of the transaction, and contains the number of + * items logged by the transaction so we know how many to expect during + * recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ + { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + +/* + * Inode Log Item Format definitions. + * + * This is the structure used to lay out an inode log item in the + * log. 
The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. + */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + 
__int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ + + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely stored in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ?
XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +/* + * Incore version of the on-disk inode core structures. We log this directly + * into the journal in host CPU format (for better or worse) and as such + * directly mirrors the xfs_dinode structure as it must contain all the same + * information. + */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * except for the endianness annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork 
offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_icdinode_t; + +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + +/* + * Buffer Log Format definitions + * + * These are the physical dirty bitmap definitions for the log format structure. + */ +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF (1<<0) + +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL (1<<1) + +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling.
+ */ +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. 
+ */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* + * EFI/EFD log format definitions + */ +typedef struct xfs_extent { + xfs_fsblock_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. 
+ */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. 
The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + +/* + * Dquot Log format definitions. + * + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. 
+ */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) + +/* + * Inode create log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. 
+ */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +#endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_recover.h b/kernel/fs/xfs/libxfs/xfs_log_recover.h new file mode 100644 index 000000000..1c55ccbb3 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_recover.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_RECOVER_H__ +#define __XFS_LOG_RECOVER_H__ + +/* + * Macros, structures, prototypes for internal log manager use. + */ + +#define XLOG_RHASH_BITS 4 +#define XLOG_RHASH_SIZE 16 +#define XLOG_RHASH_SHIFT 2 +#define XLOG_RHASH(tid) \ + ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) + +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) + + +/* + * item headers are in ri_buf[0]. Additional buffers follow. 
+ */ +typedef struct xlog_recover_item { + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ +} xlog_recover_item_t; + +struct xlog_tid; +typedef struct xlog_recover { + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ +} xlog_recover_t; + +#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) + +/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_RECOVER_PASS1 1 +#define XLOG_RECOVER_PASS2 2 + +#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_log_rlimit.c b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c new file mode 100644 index 000000000..c10597973 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_log_rlimit.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2013 Jie Liu. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_trans_space.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_bmap_btree.h" + +/* + * Calculate the maximum length in bytes that would be required for a local + * attribute value as large attributes out of line are not logged. + */ +STATIC int +xfs_log_calc_max_attrsetm_res( + struct xfs_mount *mp) +{ + int size; + int nblks; + + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - + MAXNAMELEN - 1; + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + nblks += XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); + + return M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; +} + +/* + * Iterate over the log space reservation table to figure out and return + * the maximum one in terms of the pre-calculated values which were done + * at mount time. + */ +STATIC void +xfs_log_get_max_trans_res( + struct xfs_mount *mp, + struct xfs_trans_res *max_resp) +{ + struct xfs_trans_res *resp; + struct xfs_trans_res *end_resp; + int log_space = 0; + int attr_space; + + attr_space = xfs_log_calc_max_attrsetm_res(mp); + + resp = (struct xfs_trans_res *)M_RES(mp); + end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (; resp < end_resp; resp++) { + int tmp = resp->tr_logcount > 1 ? 
+ resp->tr_logres * resp->tr_logcount : + resp->tr_logres; + if (log_space < tmp) { + log_space = tmp; + *max_resp = *resp; /* struct copy */ + } + } + + if (attr_space > log_space) { + *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + max_resp->tr_logres = attr_space; + } +} + +/* + * Calculate the minimum valid log size for the given superblock configuration. + * Used to calculate the minimum log size at mkfs time, and to determine if + * the log is large enough or not at mount time. Returns the minimum size in + * filesystem block size units. + */ +int +xfs_log_calc_minimum_size( + struct xfs_mount *mp) +{ + struct xfs_trans_res tres = {0}; + int max_logres; + int min_logblks = 0; + int lsunit = 0; + + xfs_log_get_max_trans_res(mp, &tres); + + max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); + if (tres.tr_logcount > 1) + max_logres *= tres.tr_logcount; + + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + lsunit = BTOBB(mp->m_sb.sb_logsunit); + + /* + * Two factors should be taken into account for calculating the minimum + * log space. + * 1) The fundamental limitation is that no single transaction can be + * larger than half size of the log. + * + * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR + * define, which is set to 3. That means we can definitely fit + * maximally sized 2 transactions in the log. We'll use this same + * value here. + * + * 2) If the lsunit option is specified, a transaction requires 2 LSU + * for the reservation because there are two log writes that can + * require padding - the transaction data and the commit record which + * are written separately and both can require padding to the LSU.
+ * Consider that we can have an active CIL reservation holding 2*LSU, + * but the CIL is not over a push threshold, in this case, if we + * don't have enough log space for at least one new transaction, which + * includes another 2*LSU in the reservation, we will run into a dead + * loop situation in log space grant procedure. i.e. + * xlog_grant_head_wait(). + * + * Hence the log size needs to be able to contain two maximally sized + * and padded transactions, which is (2 * (2 * LSU + maxlres)). + * + * Also, the log size should be a multiple of the log stripe unit, round + * it up to lsunit boundary if lsunit is specified. + * + * NOTE(review): the non-lsunit branch adds 2 * BBSIZE to a value that + * comes out of BTOBB(); confirm BBSIZE is intended as a block count + * there rather than a byte count. + */ + if (lsunit) { + min_logblks = roundup_64(BTOBB(max_logres), lsunit) + + 2 * lsunit; + } else + min_logblks = BTOBB(max_logres) + 2 * BBSIZE; + min_logblks *= XFS_MIN_LOG_FACTOR; + + return XFS_BB_TO_FSB(mp, min_logblks); +} diff --git a/kernel/fs/xfs/libxfs/xfs_quota_defs.h b/kernel/fs/xfs/libxfs/xfs_quota_defs.h new file mode 100644 index 000000000..1b0a08379 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_quota_defs.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_DEFS_H__ +#define __XFS_QUOTA_DEFS_H__ + +/* + * Quota definitions shared between user and kernel source trees.
+ */ + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_FREEING 0x0010 /* dquot is being torn down */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_FREEING, "FREEING" } + +/* + * We have the possibility of all three quota types being active at once, and + * hence free space modification requires modification of all three current + * dquots in a single transaction. For this case we need to have a reservation + * of at least 3 dquots. + * + * However, a chown operation can change both UID and GID in a single + * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be + * modified. Hence for this case we need to reserve space for at least 4 dquots. + * + * And in the worst case, there's a rename operation that can be modifying up to + * 4 inodes with dquots attached to them. In reality, the only inodes that can + * have their dquots modified are the source and destination directory inodes + * due to directory name creation and removal. That can require space allocation + * and/or freeing on both directory inodes, and hence all three dquots on each + * inode can be modified. And if the directories are world writeable, all the + * dquots can be unique and so 6 dquots can be modified....
+ * + * And, of course, we also need to take into account the dquot log format item + * used to describe each dquot. + */ +#define XFS_DQUOT_LOGRES(mp) \ + ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie.
their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. + */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. 
+ */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, + xfs_dqid_t id, uint type, uint flags, char *str); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); + +#endif /* __XFS_QUOTA_DEFS_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_rtbitmap.c b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c new file mode 100644 index 000000000..9b59ffa1f --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_rtbitmap.c @@ -0,0 +1,991 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_rtalloc.h" + + +/* + * Realtime allocator bitmap functions shared with userspace. + */ + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +int +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap = 1; + int error; /* error value */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + *bpp = bp; + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. 
+ */ +int +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. 
+ */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. 
+ */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. 
+ */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word.
+ */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Read and/or modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + * + * Summary information is returned in *sum if specified. + * If no delta is specified, returns summary only. 
+ */ +int +xfs_rtmodify_summary_int( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (*rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (*rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + *rbpp = bp; + *rsb = sb; + } + /* + * Point to the summary information, modify/log it, and/or copy it out. 
+ */ + sp = XFS_SUMPTR(mp, bp, so); + if (delta) { + uint first = (uint)((char *)sp - (char *)bp->b_addr); + + *sp += delta; + xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + } + if (sum) + *sum = *sp; + return 0; +} + +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + return xfs_rtmodify_summary_int(mp, tp, log, bbno, + delta, rbpp, rsb, NULL); +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +int +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. 
+ */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. 
+ */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +int +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). 
+ */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +int +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. 
to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. 
+ */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. 
+ */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; + + error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + if (error) + return error; + ASSERT(stat); + return 0; +} +#else +#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#endif +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + + mp = tp->t_mountp; + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + if (error) + return error; + + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. 
+ */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + diff --git a/kernel/fs/xfs/libxfs/xfs_sb.c b/kernel/fs/xfs/libxfs/xfs_sb.c new file mode 100644 index 000000000..dc4bfc5d8 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_sb.c @@ -0,0 +1,803 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" + +/* + * Physical superblock buffer manipulations. Shared with libxfs in userspace. + */ + +/* + * Reference counting access wrappers to the perag structures. 
+ * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + bool check_inprogress, + bool check_version) +{ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + xfs_warn(mp, "bad magic number"); + return -EWRONGFS; + } + + + if (!xfs_sb_good_version(sbp)) { + xfs_warn(mp, "bad version"); + return -EWRONGFS; + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. 
+ */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return -EINVAL; + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return -EINVAL; + } + } + + if (xfs_sb_version_has_pquotino(sbp)) { + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + xfs_notice(mp, + "Version 5 of Super block has XFS_OQUOTA bits."); + return -EFSCORRUPTED; + } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); + return -EFSCORRUPTED; + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return -EINVAL; + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return -EINVAL; + } + + /* + * More sanity checking. 
Most of these were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { + xfs_notice(mp, "SB sanity check failed"); + return -EFSCORRUPTED; + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + return -ENOSYS; + } + + /* + * Currently only very few inode sizes are supported. 
+ */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return -ENOSYS; + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + return -EFBIG; + } + + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + return -EFSCORRUPTED; + } + return 0; +} + +void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + /* + * older mkfs doesn't initialize quota inodes to NULLFSINO. This + * leads to in-core values having two different values for a quota + * inode to be invalid: 0 and NULLFSINO. Change it to a single value + * NULLFSINO. + * + * Note that this change affect only the in-core values. These + * values are not written back to disk unless any quota information + * is written to the disk. Even in that case, sb_pquotino field is + * not written to disk unless the superblock supports pquotino. + */ + if (sbp->sb_uquotino == 0) + sbp->sb_uquotino = NULLFSINO; + if (sbp->sb_gquotino == 0) + sbp->sb_gquotino = NULLFSINO; + if (sbp->sb_pquotino == 0) + sbp->sb_pquotino = NULLFSINO; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(sbp)) + return; + + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); + + if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + /* + * In older version of superblock, on-disk superblock only + * has sb_gquotino, and in-core superblock has both sb_gquotino + * and sb_pquotino. 
But, only one of them is supported at any + * point of time. So, if PQUOTA is set in disk superblock, + * copy over sb_gquotino to sb_pquotino. + */ + sbp->sb_pquotino = sbp->sb_gquotino; + sbp->sb_gquotino = NULLFSINO; + } +} + +static void +__xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from, + bool convert_xquota) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = 
be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + /* crc is only used on disk, not in memory; just init to 0 here. */ + to->sb_crc = 0; + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); + /* Convert on-disk flags to in-memory flags? */ + if (convert_xquota) + xfs_sb_quota_from_disk(to); +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + __xfs_sb_from_disk(to, from, true); +} + +static void +xfs_sb_quota_to_disk( + struct xfs_dsb *to, + struct xfs_sb *from) +{ + __uint16_t qflags = from->sb_qflags; + + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); + if (xfs_sb_version_has_pquotino(from)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + to->sb_pquotino = cpu_to_be64(from->sb_pquotino); + return; + } + + /* + * The in-core version of sb_qflags do not have XFS_OQUOTA_* + * flags, whereas the on-disk version does. So, convert incore + * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. 
+ */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + + /* + * GQUOTINO and PQUOTINO cannot be used together in versions + * of superblock that do not have pquotino. from->sb_flags + * tells us which quota is active and should be copied to + * disk. If neither are active, we should NULL the inode. + * + * In all cases, the separate pquotino must remain 0 because it + * it beyond the "end" of the valid non-pquotino superblock. + */ + if (from->sb_qflags & XFS_GQUOTA_ACCT) + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + else if (from->sb_qflags & XFS_PQUOTA_ACCT) + to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. 
+ */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } + + to->sb_pquotino = 0; +} + +void +xfs_sb_to_disk( + struct xfs_dsb *to, + struct xfs_sb *from) +{ + xfs_sb_quota_to_disk(to, from); + + to->sb_magicnum = cpu_to_be32(from->sb_magicnum); + to->sb_blocksize = cpu_to_be32(from->sb_blocksize); + to->sb_dblocks = cpu_to_be64(from->sb_dblocks); + to->sb_rblocks = cpu_to_be64(from->sb_rblocks); + to->sb_rextents = cpu_to_be64(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = cpu_to_be64(from->sb_logstart); + to->sb_rootino = cpu_to_be64(from->sb_rootino); + to->sb_rbmino = cpu_to_be64(from->sb_rbmino); + to->sb_rsumino = cpu_to_be64(from->sb_rsumino); + to->sb_rextsize = cpu_to_be32(from->sb_rextsize); + to->sb_agblocks = cpu_to_be32(from->sb_agblocks); + to->sb_agcount = cpu_to_be32(from->sb_agcount); + to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); + to->sb_logblocks = cpu_to_be32(from->sb_logblocks); + to->sb_versionnum = cpu_to_be16(from->sb_versionnum); + to->sb_sectsize = cpu_to_be16(from->sb_sectsize); + to->sb_inodesize = cpu_to_be16(from->sb_inodesize); + to->sb_inopblock = cpu_to_be16(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = cpu_to_be64(from->sb_icount); + to->sb_ifree = cpu_to_be64(from->sb_ifree); + to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); + to->sb_frextents = cpu_to_be64(from->sb_frextents); + + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); + to->sb_unit = 
cpu_to_be32(from->sb_unit); + to->sb_width = cpu_to_be32(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); + to->sb_logsunit = cpu_to_be32(from->sb_logsunit); + + /* + * We need to ensure that bad_features2 always matches features2. + * Hence we enforce that here rather than having to remember to do it + * everywhere else that updates features2. + */ + from->sb_bad_features2 = from->sb_features2; + to->sb_features2 = cpu_to_be32(from->sb_features2); + to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); + + if (xfs_sb_version_hascrc(from)) { + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_lsn = cpu_to_be64(from->sb_lsn); + } +} + +static int +xfs_sb_verify( + struct xfs_buf *bp, + bool check_version) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + + /* + * Use call variant which doesn't convert quota flags from disk + * format, because xfs_mount_validate_sb checks the on-disk flags. + */ + __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); +} + +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. 
+ * + * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the + * last field in V4 secondary superblocks. So for secondary superblocks, + * we are more forgiving, and ignore CRC failures if the primary doesn't + * indicate that the fs version is V5. + */ +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ + if (bp->b_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = -EFSBADCRC; + goto out_error; + } + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + xfs_buf_ioerror(bp, error); + if (error == -EFSCORRUPTED || error == -EFSBADCRC) + xfs_verifier_error(bp); + } +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { + /* XFS filesystem, verify noisily! 
*/ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, -EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +/* + * xfs_sb_mount_common + * + * Mount initialization code establishing various mount fields (shifts, + * masks, btree record limits, inode chunk geometry) from the superblock + * sbp. Note that m_maxagi is taken from mp->m_sb, not from sbp. + */ +void +xfs_sb_mount_common( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); +
mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. 
+ */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + + /* Overwrite incore superblock counters with just-read data */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + xfs_reinit_percpu_counters(mp); + + return 0; +} + +/* + * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock + * into the superblock buffer to be logged. It does not provide the higher + * level of locking that is needed to protect the in-core superblock from + * concurrent access. + */ +void +xfs_log_sb( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); +} + +/* + * xfs_sync_sb + * + * Sync the superblock to disk. + * + * Note that the caller is responsible for checking the frozen state of the + * filesystem. This procedure uses the non-blocking transaction allocator and + * thus will allow modifications to a frozen fs. This is required because this + * code can be called during the process of freezing where use of the high-level + * allocator would deadlock. 
+ */ +int +xfs_sync_sb( + struct xfs_mount *mp, + bool wait) +{ + struct xfs_trans *tp; + int error; + + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_log_sb(tp); + if (wait) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} diff --git a/kernel/fs/xfs/libxfs/xfs_sb.h b/kernel/fs/xfs/libxfs/xfs_sb.h new file mode 100644 index 000000000..b25bb9a34 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_sb.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SB_H__ +#define __XFS_SB_H__ + +/* + * perag get/put wrappers for ref counting + */ +extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +extern void xfs_perag_put(struct xfs_perag *pag); +extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +extern void xfs_sb_calc_crc(struct xfs_buf *bp); +extern void xfs_log_sb(struct xfs_trans *tp); +extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); +extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); +extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); + +#endif /* __XFS_SB_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_shared.h b/kernel/fs/xfs/libxfs/xfs_shared.h new file mode 100644 index 000000000..8dda4b321 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_shared.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SHARED_H__ +#define __XFS_SHARED_H__ + +/* + * Definitions shared between kernel and userspace that don't fit into any other + * header file that is shared with userspace. + */ +struct xfs_ifork; +struct xfs_buf; +struct xfs_buf_ops; +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; + +/* + * Buffer verifier operations are widely used, including userspace tools + */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +/* + * Transaction types. Used to distinguish types of buffers. These never reach + * the log. 
+ */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_SB_CHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_FSYNC_TS 35 +#define XFS_TRANS_GROWFSRT_ALLOC 36 +#define XFS_TRANS_GROWFSRT_ZERO 37 +#define XFS_TRANS_GROWFSRT_FREE 38 +#define XFS_TRANS_SWAPEXT 39 +#define XFS_TRANS_CHECKPOINT 40 +#define XFS_TRANS_ICREATE 41 +#define XFS_TRANS_CREATE_TMPFILE 42 +#define XFS_TRANS_TYPE_MAX 43 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" 
}, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_ICREATE, "ICREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). 
+ */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* log size calculation functions */ +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer reference count + * values. This determines how hard the buffer cache tries to hold onto the + * buffer. 
+ */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +#endif /* __XFS_SHARED_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_symlink_remote.c b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c new file mode 100644 index 000000000..e7e26bd64 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_symlink_remote.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. + */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + + return (pathlen + buflen - 1) / buflen; +} + +int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. 
+ */ +bool +xfs_symlink_hdr_ok( + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_symlink_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + 
.verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.c b/kernel/fs/xfs/libxfs/xfs_trans_resv.c new file mode 100644 index 000000000..68cb1e7bf --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate the transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} + +/* + * Logging inodes is really tricksy. They are logged in memory format, + * which means that what we write into the log doesn't directly translate into + * the amount of space they use on disk. + * + * Case in point - btree format forks in memory format use more space than the + * on-disk format. In memory, the buffer contains a normal btree block header so + * the btree code can treat it as though it is just another generic buffer.
+ * However, when we write it to the inode fork, we don't write all of this + * header as it isn't needed. e.g. the root is only ever in the inode, so + * there's no need for sibling pointers which would waste 16 bytes of space. + * + * Hence when we have an inode with a maximally sized btree format fork, then + * amount of information we actually log is greater than the size of the inode + * on disk. Hence we need an inode reservation function that calculates all this + * correctly. So, we log: + * + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks + * - inode log format object + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. + */ +STATIC uint +xfs_calc_inode_res( + struct xfs_mount *mp, + uint ninodes) +{ + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); +} + +/* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. 
This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; +} + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. 
This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * In truncating a file we free up to two extents at once. 
We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0))); +} + +/* + * In renaming files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 4) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth 
+ v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); +} + +/* + * For create we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: mp->m_ialloc_blks * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint 
+xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +/* + * For icreate we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) + */ +STATIC uint +xfs_calc_icreate_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + +} + +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + +/* + * Making a new directory is the same as creating a new file. 
+ */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + + +/* + * Making a new symlink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_iunlink_remove_reservation(mp) + + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); +} + +/* + * When only changing the inode we log the inode and possibly the superblock. + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); + +} + +/* + * Growing the data section of the filesystem. 
+ * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); +} + +/* + * Logging the inode modification timestamp on a synchronous write. 
+ * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Setting an attribute at mount time. 
+ * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime(see + * below). + */ +STATIC uint +xfs_calc_attrsetm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attribute extent length in blocks by: + * ext * M_RES(mp)->tr_attrsetrt.tr_logres + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing an attribute. 
+ * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space for quota file extent allocation + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. 
+ * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); + resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); + resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); + resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); + resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + 
resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); + resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; + resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp); + resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT; + resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp); + resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT; + resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT; + resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp); + resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT; + resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + /* + * The following transactions are logged in logical format with + 
* a default log count. + */ + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_equotaoff.tr_logres = + xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); + resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + /* The following transaction are logged in logical format */ + resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); + resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); + resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); + resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); +} diff --git a/kernel/fs/xfs/libxfs/xfs_trans_resv.h b/kernel/fs/xfs/libxfs/xfs_trans_resv.h new file mode 100644 index 000000000..2d5bdfce6 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_resv.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_RESV_H__ +#define __XFS_TRANS_RESV_H__ + +struct xfs_mount; + +/* + * structure for maintaining pre-calculated transaction reservations. + */ +struct xfs_trans_res { + uint tr_logres; /* log space unit in bytes per log ticket */ + int tr_logcount; /* number of log operations per log ticket */ + int tr_logflags; /* log flags, currently only used for indicating + * a reservation request is permanent or not */ +}; + +struct xfs_trans_resv { + struct xfs_trans_res tr_write; /* extent alloc trans */ + struct xfs_trans_res tr_itruncate; /* truncate trans */ + struct xfs_trans_res tr_rename; /* rename trans */ + struct xfs_trans_res tr_link; /* link trans */ + struct xfs_trans_res tr_remove; /* unlink trans */ + struct xfs_trans_res tr_symlink; /* symlink trans */ + struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ + struct xfs_trans_res tr_mkdir; /* mkdir trans */ + struct xfs_trans_res tr_ifree; /* inode free trans */ + struct xfs_trans_res tr_ichange; /* inode update trans */ + struct xfs_trans_res tr_growdata; /* fs data section grow trans */ + struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ + struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ + struct xfs_trans_res tr_attrinval; /* attr fork buffer + * invalidation */ + struct xfs_trans_res tr_attrsetm; /* set/create an attribute at + * mount time */ + struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at + * runtime */ + struct xfs_trans_res tr_attrrm; /* remove an attribute */ + struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */ + struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ + struct xfs_trans_res tr_growrtzero; /* grow realtime 
zeroing */ + struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ + struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ + struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ + struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ + struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ + struct xfs_trans_res tr_sb; /* modify superblock */ + struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ +}; + +/* shorthand way of accessing reservation structure */ +#define M_RES(mp) (&(mp)->m_resv) + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * Various log count values. 
+ */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); + +#endif /* __XFS_TRANS_RESV_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_trans_space.h b/kernel/fs/xfs/libxfs/xfs_trans_space.h new file mode 100644 index 000000000..bf9c45793 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_trans_space.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_SPACE_H__ +#define __XFS_TRANS_SPACE_H__ + +/* + * Components of space reservations. 
+ */ +#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ + (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) +#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) +#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\ + ((((b) + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) +#define XFS_DAENTER_DBS(mp,w) \ + (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) +#define XFS_DAENTER_BLOCKS(mp,w) \ + (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w)) +#define XFS_DAENTER_BMAP1B(mp,w) \ + XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w) +#define XFS_DAENTER_BMAPS(mp,w) \ + (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w)) +#define XFS_DAENTER_SPACE_RES(mp,w) \ + (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w)) +#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w) +#define XFS_DIRENTER_MAX_SPLIT(mp,nl) 1 +#define XFS_DIRENTER_SPACE_RES(mp,nl) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \ + XFS_DIRENTER_MAX_SPLIT(mp,nl)) +#define XFS_DIRREMOVE_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) +/* + * Inode allocation: m_ialloc_blks blocks plus (m_in_maxlevels - 1) blocks, + * the latter multiplied by 2 when xfs_sb_version_hasfinobt() is true and by 1 + * otherwise. The multiplier is parenthesized so that the conditional selects + * the factor only; without the parentheses the multiplication binds to the + * else-arm and the true-arm collapses to a constant 2. + */ +#define XFS_IALLOC_SPACE_RES(mp) \ + ((mp)->m_ialloc_blks + \ + ((xfs_sb_version_hasfinobt(&(mp)->m_sb) ? 2 : 1) * \ + ((mp)->m_in_maxlevels - 1))) + +/* + * Space reservation values for various transactions. 
+ */ +#define XFS_ADDAFORK_SPACE_RES(mp) \ + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) +#define XFS_ATTRRM_SPACE_RES(mp) \ + XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) +/* This macro is not used - see inline code in xfs_attr_set */ +#define XFS_ATTRSET_SPACE_RES(mp, v) \ + (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v)) +#define XFS_CREATE_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_DIOSTRAT_SPACE_RES(mp, v) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) +#define XFS_GROWFS_SPACE_RES(mp) \ + (2 * XFS_AG_MAXLEVELS(mp)) +#define XFS_GROWFSRT_SPACE_RES(mp,b) \ + ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) +#define XFS_LINK_SPACE_RES(mp,nl) \ + XFS_DIRENTER_SPACE_RES(mp,nl) +#define XFS_MKDIR_SPACE_RES(mp,nl) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_QM_DQALLOC_SPACE_RES(mp) \ + (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \ + XFS_DQUOT_CLUSTER_SIZE_FSB) +#define XFS_QM_QINOCREATE_SPACE_RES(mp) \ + XFS_IALLOC_SPACE_RES(mp) +#define XFS_REMOVE_SPACE_RES(mp) \ + XFS_DIRREMOVE_SPACE_RES(mp) +#define XFS_RENAME_SPACE_RES(mp,nl) \ + (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) +#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ + (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +/* Inode free: m_in_maxlevels blocks when xfs_sb_version_hasfinobt() is true, else none. */ +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&(mp)->m_sb) ? (mp)->m_in_maxlevels : 0) + + +#endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/kernel/fs/xfs/libxfs/xfs_types.h b/kernel/fs/xfs/libxfs/xfs_types.h new file mode 100644 index 000000000..b79dc66b2 --- /dev/null +++ b/kernel/fs/xfs/libxfs/xfs_types.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TYPES_H__ +#define __XFS_TYPES_H__ + +typedef __uint32_t prid_t; /* project ID */ + +typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ +typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef __uint32_t xfs_agnumber_t; /* allocation group number */ +typedef __int32_t xfs_extnum_t; /* # of extents in a file */ +typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */ +typedef __int64_t xfs_fsize_t; /* bytes in a file */ +typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ + +typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */ + +typedef __int64_t xfs_lsn_t; /* log sequence number */ +typedef __int32_t xfs_tid_t; /* transaction identifier */ + +typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ +typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ + +typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ +typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ +typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ +typedef __uint64_t xfs_fileoff_t; /* block number in a file */ +typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ + +typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ +typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ + +/* + * Null values 
for the types. + */ +#define NULLFSBLOCK ((xfs_fsblock_t)-1) +#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) +#define NULLRTBLOCK ((xfs_rtblock_t)-1) +#define NULLFILEOFF ((xfs_fileoff_t)-1) + +#define NULLAGBLOCK ((xfs_agblock_t)-1) +#define NULLAGNUMBER ((xfs_agnumber_t)-1) +#define NULLEXTNUM ((xfs_extnum_t)-1) + +#define NULLCOMMITLSN ((xfs_lsn_t)-1) + +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + +/* + * Max values for extlen, extnum, aextnum. + */ +#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */ +#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */ +#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ + +/* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Inode fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* + * Min numbers of data/attr fork btree root pointers. + */ +#define MINDBTPTRS 3 +#define MINABTPTRS 2 + +/* + * MAXNAMELEN is the length (including the terminating null) of + * the longest permissible file (component) name. 
+ */ +#define MAXNAMELEN 256 + +typedef enum { + XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi +} xfs_lookup_t; + +typedef enum { + XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, + XFS_BTNUM_FINOi, XFS_BTNUM_MAX +} xfs_btnum_t; + +struct xfs_name { + const unsigned char *name; + int len; + int type; +}; + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Constants for bit manipulations. + */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + + +#endif /* __XFS_TYPES_H__ */ diff --git a/kernel/fs/xfs/mrlock.h b/kernel/fs/xfs/mrlock.h new file mode 100644 index 000000000..e3c92d19e --- /dev/null +++ b/kernel/fs/xfs/mrlock.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include + +typedef struct { + struct rw_semaphore mr_lock; +#if defined(DEBUG) || defined(XFS_WARN) + int mr_writer; +#endif +} mrlock_t; + +#if defined(DEBUG) || defined(XFS_WARN) +#define mrinit(mrp, name) \ + do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) + +static inline void mraccess_nested(mrlock_t *mrp, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 1; +#endif +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 1; +#endif + return 1; +} + +static inline void mrunlock_excl(mrlock_t *mrp) +{ +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); +} + +static inline void mrunlock_shared(mrlock_t *mrp) +{ + up_read(&mrp->mr_lock); +} + +static inline void mrdemote(mrlock_t *mrp) +{ +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; +#endif + downgrade_write(&mrp->mr_lock); +} + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/kernel/fs/xfs/uuid.c b/kernel/fs/xfs/uuid.c new file mode 100644 index 000000000..b83f76b6d --- /dev/null +++ b/kernel/fs/xfs/uuid.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon 
Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + +/* + * uuid_getnodeuniq - obtain the node unique fields of a UUID. + * + * This is not in any way a standard or condoned UUID function; + * it is just something that's needed for user-level file handles. + */ +void +uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) +{ + xfs_uu_t *uup = (xfs_uu_t *)uuid; + + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be32_to_cpu(uup->uu_timelow); +} + +int +uuid_is_nil(uuid_t *uuid) +{ + int i; + char *cp = (char *)uuid; + + if (uuid == NULL) + return 0; + /* implied check of version number here... */ + for (i = 0; i < sizeof *uuid; i++) + if (*cp++) return 0; /* not nil */ + return 1; /* is nil */ +} + +int +uuid_equal(uuid_t *uuid1, uuid_t *uuid2) +{ + return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; +} diff --git a/kernel/fs/xfs/uuid.h b/kernel/fs/xfs/uuid.h new file mode 100644 index 000000000..104db0f3b --- /dev/null +++ b/kernel/fs/xfs/uuid.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPPORT_UUID_H__ +#define __XFS_SUPPORT_UUID_H__ + +typedef struct { + unsigned char __u_bits[16]; +} uuid_t; + +extern int uuid_is_nil(uuid_t *uuid); +extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); +extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); + +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} + +#endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/kernel/fs/xfs/xfs.h b/kernel/fs/xfs/xfs.h new file mode 100644 index 000000000..a742c47f7 --- /dev/null +++ b/kernel/fs/xfs/xfs.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_H__ +#define __XFS_H__ + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +#endif + +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + +#include "xfs_linux.h" + +#endif /* __XFS_H__ */ diff --git a/kernel/fs/xfs/xfs_acl.c b/kernel/fs/xfs/xfs_acl.c new file mode 100644 index 000000000..4b641676f --- /dev/null +++ b/kernel/fs/xfs/xfs_acl.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_acl.h" +#include "xfs_attr.h" +#include "xfs_trace.h" +#include +#include +#include + + +/* + * Locking scheme: + * - all ACL updates are protected by inode->i_mutex, which is taken before + * calling into this file. 
+ */ + +STATIC struct posix_acl * +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) +{ + struct posix_acl_entry *acl_e; + struct posix_acl *acl; + struct xfs_acl_entry *ace; + unsigned int count, i; + + count = be32_to_cpu(aclp->acl_cnt); + if (count > max_entries) + return ERR_PTR(-EFSCORRUPTED); + + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + acl_e = &acl->a_entries[i]; + ace = &aclp->acl_entry[i]; + + /* + * The tag is 32 bits on disk and 16 bits in core. + * + * Because every access to it goes through the core + * format first this is not a problem. + */ + acl_e->e_tag = be32_to_cpu(ace->ae_tag); + acl_e->e_perm = be16_to_cpu(ace->ae_perm); + + switch (acl_e->e_tag) { + case ACL_USER: + acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + break; + case ACL_GROUP: + acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + break; + default: + goto fail; + } + } + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +STATIC void +xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) +{ + const struct posix_acl_entry *acl_e; + struct xfs_acl_entry *ace; + int i; + + aclp->acl_cnt = cpu_to_be32(acl->a_count); + for (i = 0; i < acl->a_count; i++) { + ace = &aclp->acl_entry[i]; + acl_e = &acl->a_entries[i]; + + ace->ae_tag = cpu_to_be32(acl_e->e_tag); + switch (acl_e->e_tag) { + case ACL_USER: + ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + break; + case ACL_GROUP: + ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + break; + default: + ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); + break; + } + + ace->ae_perm = cpu_to_be16(acl_e->e_perm); + } +} + +struct posix_acl * +xfs_get_acl(struct inode *inode, int type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct posix_acl *acl = NULL; + struct xfs_acl *xfs_acl; + unsigned char 
*ea_name; + int error; + int len; + + trace_xfs_get_acl(ip); + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + ea_name = SGI_ACL_DEFAULT; + break; + default: + BUG(); + } + + /* + * No cached ACL is consulted here: allocate a buffer of + * XFS_ACL_MAX_SIZE() bytes and read the attribute into it. + */ + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return ERR_PTR(-ENOMEM); + + error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); + if (error) { + /* + * If the attribute doesn't exist make sure we have a negative + * cache entry, for any other error assume it is transient and + * leave the cache entry as ACL_NOT_CACHED (acl stays NULL on that path). + */ + if (error == -ENOATTR) + goto out_update_cache; + goto out; + } + + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + if (IS_ERR(acl)) + goto out; + +out_update_cache: + set_cached_acl(inode, type, acl); +out: + kmem_free(xfs_acl); + return acl; +} + +STATIC int +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct xfs_inode *ip = XFS_I(inode); + unsigned char *ea_name; + int error; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + ea_name = SGI_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + struct xfs_acl *xfs_acl; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); + + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return -ENOMEM; + + xfs_acl_to_disk(xfs_acl, acl); + + /* trim len so that only the acl->a_count entries in use are written */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); + + error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + len, ATTR_ROOT); + + kmem_free(xfs_acl); + } else { + /* + * A NULL ACL argument means we want to remove the ACL.
+ */ + error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; + } + + if (!error) + set_cached_acl(inode, type, acl); + return error; +} + +static int +xfs_set_mode(struct inode *inode, umode_t mode) +{ + int error = 0; + + if (mode != inode->i_mode) { + struct iattr iattr; + + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); + + error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); + } + + return error; +} + +static int +xfs_acl_exists(struct inode *inode, unsigned char *name) +{ + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); + + return (xfs_attr_get(XFS_I(inode), name, NULL, &len, + ATTR_ROOT|ATTR_KERNOVAL) == 0); +} + +int +posix_acl_access_exists(struct inode *inode) +{ + return xfs_acl_exists(inode, SGI_ACL_FILE); +} + +int +posix_acl_default_exists(struct inode *inode) +{ + if (!S_ISDIR(inode->i_mode)) + return 0; + return xfs_acl_exists(inode, SGI_ACL_DEFAULT); +} + +int +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error = 0; + + if (!acl) + goto set_acl; + + error = -E2BIG; + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) + return error; + + if (type == ACL_TYPE_ACCESS) { + umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + + if (error <= 0) { + acl = NULL; + + if (error < 0) + return error; + } + + error = xfs_set_mode(inode, mode); + if (error) + return error; + } + + set_acl: + return __xfs_set_acl(inode, type, acl); +} diff --git a/kernel/fs/xfs/xfs_acl.h b/kernel/fs/xfs/xfs_acl.h new file mode 100644 index 000000000..3841b07f2 --- /dev/null +++ b/kernel/fs/xfs/xfs_acl.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ACL_H__ +#define __XFS_ACL_H__ + +struct inode; +struct posix_acl; +struct xfs_inode; + +#ifdef CONFIG_XFS_POSIX_ACL +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int posix_acl_access_exists(struct inode *inode); +extern int posix_acl_default_exists(struct inode *inode); +#else +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} +# define xfs_set_acl NULL +# define posix_acl_access_exists(inode) 0 +# define posix_acl_default_exists(inode) 0 +#endif /* CONFIG_XFS_POSIX_ACL */ +#endif /* __XFS_ACL_H__ */ diff --git a/kernel/fs/xfs/xfs_aops.c b/kernel/fs/xfs/xfs_aops.c new file mode 100644 index 000000000..a56960dd1 --- /dev/null +++ b/kernel/fs/xfs/xfs_aops.c @@ -0,0 +1,1931 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include +#include +#include +#include + +void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ +STATIC void +xfs_destroy_ioend( + xfs_ioend_t *ioend) +{ + struct buffer_head *bh, *next; + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, !ioend->io_error); + } + + mempool_free(ioend, xfs_ioend_pool); +} + +/* + * Fast and loose check if this write could update the on-disk inode size. 
+ */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +STATIC int +xfs_setfilesize_trans_alloc( + struct xfs_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + ioend->io_append_trans = tp; + + /* + * We may pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* + * We hand off the transaction to the completion thread now, so + * clear the flag here. + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return 0; +} + +/* + * Update on-disk file size now that data has been written to disk. + */ +STATIC int +xfs_setfilesize( + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_off_t offset, + size_t size) +{ + xfs_fsize_t isize; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_new_eof(ip, offset + size); + if (!isize) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, 0); + return 0; + } + + trace_xfs_setfilesize(ip, offset, size); + + ip->i_d.di_size = isize; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return xfs_trans_commit(tp, 0); +} + +STATIC int +xfs_setfilesize_ioend( + struct xfs_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; + + /* + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as being in a transaction manually. + * Similarly for freeze protection. 
+ */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); +} + +/* + * Schedule IO completion handling on the final put of an ioend. + * + * Unwritten and appending ioends are queued to a workqueue; if there is + * no work to do the ioend is freed right now. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); + } +} + +/* + * IO write completion, run as the ioend's io_work item. + */ +STATIC void +xfs_end_io( + struct work_struct *work) +{ + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ioend->io_error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extents after the data I/O has finished. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_append_trans) { + error = xfs_setfilesize_ioend(ioend); + } else { + ASSERT(!xfs_ioend_is_append(ioend)); + } + +done: + if (error) + ioend->io_error = error; + xfs_destroy_ioend(ioend); +} + +/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later + * (vs. incore size).
+ */ +STATIC xfs_ioend_t * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type) +{ + xfs_ioend_t *ioend; + + ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); + + /* + * Set the count to 1 initially, which will prevent an I/O + * completion callback from happening before we have started + * all the I/O from calling the completion routine too early. + */ + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_list = NULL; + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; + ioend->io_offset = 0; + ioend->io_size = 0; + ioend->io_append_trans = NULL; + + INIT_WORK(&ioend->io_work, xfs_end_io); + return ioend; +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + struct xfs_bmbt_irec *imap, + int type, + int nonblocking) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (type == XFS_IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -EAGAIN; + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_super->s_maxbytes); + + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return error; + + if (type == XFS_IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, imap); + if (!error) + 
trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return error; + } + +#ifdef DEBUG + if (type == XFS_IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; +} + +STATIC int +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; +} + +/* + * BIO completion handler for buffered IO. + */ +STATIC void +xfs_end_bio( + struct bio *bio, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + + /* Toss bio and pass work off to an xfsdatad thread */ + bio->bi_private = NULL; + bio->bi_end_io = NULL; + bio_put(bio); + + xfs_finish_ioend(ioend); +} + +STATIC void +xfs_submit_ioend_bio( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : WRITE, bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + ASSERT(bio->bi_private == NULL); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + + /* + * if the page was not fully cleaned, we need to ensure that the higher + * layers come back to it correctly. That means we need to keep the page + * dirty, and for WB_SYNC_ALL writeback we need to ensure the + * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to + * write this page in this writeback sweep will be made. + */ + if (clear_dirty) { + clear_page_dirty_for_io(page); + set_page_writeback(page); + } else + set_page_writeback_keepwrite(page); + + unlock_page(page); + + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) + end_page_writeback(page); +} + +static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, covering the + * initial writepage page and also any probed pages. + * + * Because we may have multiple ioends spanning a page, we need to start + * writeback on all the buffers before we submit them for I/O. 
If we mark the + * buffers as we go, then we can end up with a page that only has buffers + * marked async write and I/O completion can occur before we mark the other + * buffers async write. + * + * The end result of this is that we trip a bug in end_page_writeback() because + * we call it twice for the one page as the code in end_buffer_async_write() + * assumes that all buffers on the page are started at the same time. + * + * The fix is two passes across the ioend list - one to start writeback on the + * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked pages for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. + */ +STATIC void +xfs_submit_ioend( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + int fail) +{ + xfs_ioend_t *head = ioend; + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + /* Pass 1 - start writeback */ + do { + next = ioend->io_list; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + xfs_start_buffer_writeback(bh); + } while ((ioend = next) != NULL); + + /* Pass 2 - submit I/O */ + ioend = head; + do { + next = ioend->io_list; + bio = NULL; + + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time.
+ */ + if (fail) { + ioend->io_error = fail; + xfs_finish_ioend(ioend); + continue; + } + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this ioend. + * Toss the ioend too. Only ever called for the initial page + * in a writepage request, so only ever one page. + */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + /* + * The unwritten flag is cleared when added to the + * ioend. We're not submitting for I/O so mark the + * buffer unwritten again for next time around. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) + set_buffer_unwritten(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * The ioend in use is passed back through @result; nothing is returned.
+ */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + +STATIC void +xfs_map_buffer( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); + + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); + + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); + + bh->b_blocknr = bn; + set_buffer_mapped(bh); +} + +STATIC void +xfs_map_at_offset( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_buffer(inode, bh, imap, offset); + set_buffer_mapped(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); +} + +/* + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. 
+ */
+STATIC bool
+xfs_check_page_type(
+	struct page		*page,
+	unsigned int		type,
+	bool			check_all_buffers)
+{
+	struct buffer_head	*bh;
+	struct buffer_head	*head;
+
+	/* Pages under writeback, without a mapping or without buffers never match. */
+	if (PageWriteback(page))
+		return false;
+	if (!page->mapping)
+		return false;
+	if (!page_has_buffers(page))
+		return false;
+
+	bh = head = page_buffers(page);
+	do {
+		/*
+		 * Each buffer is classified exactly once, in priority order:
+		 * unwritten, then delalloc, then dirty+mapped (overwrite).
+		 */
+		if (buffer_unwritten(bh)) {
+			if (type == XFS_IO_UNWRITTEN)
+				return true;
+		} else if (buffer_delay(bh)) {
+			if (type == XFS_IO_DELALLOC)
+				return true;
+		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+			if (type == XFS_IO_OVERWRITE)
+				return true;
+		}
+
+		/* If we are only checking the first buffer, we are done now. */
+		if (!check_all_buffers)
+			break;
+	} while ((bh = bh->b_this_page) != head);
+
+	return false;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ *
+ * Returns non-zero when the caller should stop clustering further pages:
+ * 1 if the page could not be used at all, otherwise the "done" state
+ * accumulated while walking the page's buffers.
+ */
+STATIC int
+xfs_convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc)
+{
+	struct buffer_head	*bh, *head;
+	xfs_off_t		end_offset;
+	unsigned long		p_offset;
+	unsigned int		type;
+	int			len, page_dirty;
+	int			count = 0, done = 0, uptodate = 1;
+	xfs_off_t		offset = page_offset(page);
+
+	/*
+	 * Only take the page if it is the one we expect, we can lock it
+	 * without blocking, it is not already under writeback, it still
+	 * belongs to this inode and its first buffer matches the type of
+	 * the ioend we are extending.
+	 */
+	if (page->index != tindex)
+		goto fail;
+	if (!trylock_page(page))
+		goto fail;
+	if (PageWriteback(page))
+		goto fail_unlock_page;
+	if (page->mapping != inode->i_mapping)
+		goto fail_unlock_page;
+	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
+		goto fail_unlock_page;
+
+	/*
+	 * page_dirty is initially a count of buffers on the page before
+	 * EOF and is decremented as we move each into a cleanable state.
+	 *
+	 * Derivation:
+	 *
+	 * End offset is the highest offset that this page should represent.
+	 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+	 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+	 * hence give us the correct page_dirty count. On any other page,
+	 * it will be zero and in that case we need page_dirty to be the
+	 * count of buffers on the page.
+	 */
+	end_offset = min_t(unsigned long long,
+			(xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+			i_size_read(inode));
+
+	/*
+	 * If the current map does not span the entire page we are about to try
+	 * to write, then give up. The only way we can write a page that spans
+	 * multiple mappings in a single writeback iteration is via the
+	 * xfs_vm_writepage() function. Data integrity writeback requires the
+	 * entire page to be written in a single attempt, otherwise the part of
+	 * the page we don't write here doesn't get written as part of the data
+	 * integrity sync.
+	 *
+	 * For normal writeback, we also don't attempt to write partial pages
+	 * here as it simply means that write_cache_pages() will see it under
+	 * writeback and ignore the page until some point in the future, at
+	 * which time this will be the only page in the file that needs
+	 * writeback. Hence for more optimal IO patterns, we should always
+	 * avoid partial page writeback due to multiple mappings on a page here.
+	 */
+	if (!xfs_imap_valid(inode, imap, end_offset))
+		goto fail_unlock_page;
+
+	len = 1 << inode->i_blkbits;
+	p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+					PAGE_CACHE_SIZE);
+	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+	page_dirty = p_offset / len;
+
+	/*
+	 * The moment we find a buffer that doesn't match our current type
+	 * specification or can't be written, abort the loop and start
+	 * writeback. As per the above xfs_imap_valid() check, only
+	 * xfs_vm_writepage() can handle partial page writeback fully - we are
+	 * limited here to the buffers that are contiguous with the current
+	 * ioend, and hence a buffer we can't write breaks that contiguity and
+	 * we have to defer the rest of the IO to xfs_vm_writepage().
+	 */
+	bh = head = page_buffers(page);
+	do {
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+			done = 1;
+			break;
+		}
+
+		if (buffer_unwritten(bh) || buffer_delay(bh) ||
+		    buffer_mapped(bh)) {
+			if (buffer_unwritten(bh))
+				type = XFS_IO_UNWRITTEN;
+			else if (buffer_delay(bh))
+				type = XFS_IO_DELALLOC;
+			else
+				type = XFS_IO_OVERWRITE;
+
+			/*
+			 * imap should always be valid because of the above
+			 * partial page end_offset check on the imap.
+			 */
+			ASSERT(xfs_imap_valid(inode, imap, offset));
+
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, imap, offset);
+			/* done is still 0 here, so no new ioend is forced */
+			xfs_add_to_ioend(inode, bh, offset, type,
+					 ioendp, done);
+
+			page_dirty--;
+			count++;
+		} else {
+			done = 1;
+			break;
+		}
+	} while (offset += len, (bh = bh->b_this_page) != head);
+
+	/* Only a complete walk with every buffer uptodate marks the page uptodate. */
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	if (count) {
+		/* Charge the page against the writeback budget. */
+		if (--wbc->nr_to_write <= 0 &&
+		    wbc->sync_mode == WB_SYNC_NONE)
+			done = 1;
+	}
+	xfs_start_page_writeback(page, !page_dirty, count);
+
+	return done;
+ fail_unlock_page:
+	unlock_page(page);
+ fail:
+	return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by imap and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+	struct inode		*inode,
+	pgoff_t			tindex,
+	struct xfs_bmbt_irec	*imap,
+	xfs_ioend_t		**ioendp,
+	struct writeback_control *wbc,
+	pgoff_t			tlast)
+{
+	struct pagevec		batch;
+	int			stop = 0;
+	int			idx;
+
+	pagevec_init(&batch, 0);
+	for (;;) {
+		unsigned	want;
+
+		if (stop || tindex > tlast)
+			break;
+
+		/* Never look up more pages than remain in [tindex, tlast]. */
+		want = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+		if (!pagevec_lookup(&batch, inode->i_mapping, tindex, want))
+			break;
+
+		/*
+		 * Feed the pages to xfs_convert_page() in order; tindex
+		 * advances once per page handed over, and the first non-zero
+		 * return ends both loops.
+		 */
+		for (idx = 0; idx < pagevec_count(&batch) && !stop; idx++)
+			stop = xfs_convert_page(inode, batch.pages[idx],
+					tindex++, imap, ioendp, wbc);
+
+		pagevec_release(&batch);
+		cond_resched();
+	}
+}
+
+/*
+ * Emit the invalidatepage tracepoint, then hand the range to
+ * block_invalidatepage().
+ */
+STATIC void
+xfs_vm_invalidatepage(
+	struct page		*page,
+	unsigned int		offset,
+	unsigned int		length)
+{
+	struct inode		*host = page->mapping->host;
+
+	trace_xfs_invalidatepage(host, page, offset, length);
+	block_invalidatepage(page, offset, length);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+	struct page		*page)
+{
+	struct inode		*inode = page->mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct buffer_head	*cur, *first;
+	loff_t			pos = page_offset(page);
+
+	/*
+	 * Punching is only attempted when the page actually carries a
+	 * delalloc buffer and the filesystem has not been shut down; in
+	 * every case the page is invalidated at the end.
+	 */
+	if (xfs_check_page_type(page, XFS_IO_DELALLOC, true) &&
+	    !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_alert(ip->i_mount,
+			"page discard on page %p, inode 0x%llx, offset %llu.",
+				page, ip->i_ino, pos);
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		first = page_buffers(page);
+		cur = first;
+		do {
+			if (buffer_delay(cur)) {
+				xfs_fileoff_t	fsb;
+				int		error;
+
+				/* punch the single block under this buffer */
+				fsb = XFS_B_TO_FSBT(ip->i_mount, pos);
+				error = xfs_bmap_punch_delalloc_range(ip, fsb, 1);
+				if (error) {
+					/* something screwed, just bail */
+					if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+						xfs_alert(ip->i_mount,
+			"page discard unable to remove delalloc mapping.");
+					}
+					break;
+				}
+			}
+
+			pos += 1 << inode->i_blkbits;
+			cur = cur->b_this_page;
+		} while (cur != first);
+
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ */
+STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct inode		*inode = page->mapping->host;
+	struct buffer_head	*bh, *head;
+	struct xfs_bmbt_irec	imap;
+	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
+	loff_t			offset;
+	unsigned int		type;
+	__uint64_t              end_offset;
+	pgoff_t                 end_index, last_index;
+	ssize_t			len;
+	int			err, imap_valid = 0, uptodate = 1;
+	int			count = 0;
+	int			nonblocking = 0;
+
+	trace_xfs_writepage(inode, page, 0, 0);
+
+	ASSERT(page_has_buffers(page));
+
+	/*
+	 * Refuse to write the page out if we are called from reclaim context.
+	 *
+	 * This avoids stack overflows when called from deeply used stacks in
+	 * random callers for direct reclaim or memcg reclaim.  We explicitly
+	 * allow reclaim from kswapd as the stack usage there is relatively low.
+	 *
+	 * This should never happen except in the case of a VM regression so
+	 * warn about it.
+	 */
+	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+			PF_MEMALLOC))
+		goto redirty;
+
+	/*
+	 * Given that we do not allow direct reclaim to call us, we should
+	 * never be called while in a filesystem transaction.
+	 */
+	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+		goto redirty;
+
+	/* Is this page beyond the end of the file? */
+	offset = i_size_read(inode);
+	end_index = offset >> PAGE_CACHE_SHIFT;
+	last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * The page index is less than the end_index, adjust the end_offset
+	 * to the highest offset that this page should represent.
+	 * -----------------------------------------------------
+	 * |			file mapping	       |       |
+	 * -----------------------------------------------------
+	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
+	 * ^--------------------------------^----------|--------
+	 * |     desired writeback range    |      see else    |
+	 * ---------------------------------^------------------|
+	 */
+	if (page->index < end_index)
+		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+	else {
+		/*
+		 * Check whether the page to write out is beyond or straddles
+		 * i_size or not.
+		 * -------------------------------------------------------
+		 * |		file mapping		        |        |
+		 * -------------------------------------------------------
+		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
+		 * ^--------------------------------^-----------|---------
+		 * |				    |      Straddles     |
+		 * ---------------------------------^-----------|--------|
+		 */
+		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+
+		/*
+		 * Skip the page if it is fully outside i_size, e.g. due to a
+		 * truncate operation that is in progress. We must redirty the
+		 * page so that reclaim stops reclaiming it. Otherwise
+		 * xfs_vm_releasepage() is called on it and gets confused.
+		 *
+		 * Note that the end_index is unsigned long, it would overflow
+		 * if the given offset is greater than 16TB on 32-bit system
+		 * and if we do check the page is fully outside i_size or not
+		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
+		 * will be evaluated to 0.  Hence this page will be redirtied
+		 * and be written out repeatedly which would result in an
+		 * infinite loop, the user program that perform this operation
+		 * will hang.  Instead, we can verify this situation by checking
+		 * if the page to write is totally beyond the i_size or if it's
+		 * offset is just equal to the EOF.
+		 */
+		if (page->index > end_index ||
+		    (page->index == end_index && offset_into_page == 0))
+			goto redirty;
+
+		/*
+		 * The page straddles i_size.  It must be zeroed out on each
+		 * and every writepage invocation because it may be mmapped.
+		 * "A file is mapped in multiples of the page size.  For a file
+		 * that is not a multiple of the page size, the remaining
+		 * memory is zeroed when mapped, and writes to that region are
+		 * not written out to the file."
+		 */
+		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+		/* Adjust the end_offset to the end of file */
+		end_offset = offset;
+	}
+
+	len = 1 << inode->i_blkbits;
+
+	/* From here on, offset tracks the file position of the current buffer. */
+	bh = head = page_buffers(page);
+	offset = page_offset(page);
+	type = XFS_IO_OVERWRITE;
+
+	if (wbc->sync_mode == WB_SYNC_NONE)
+		nonblocking = 1;
+
+	do {
+		int new_ioend = 0;
+
+		if (offset >= end_offset)
+			break;
+		if (!buffer_uptodate(bh))
+			uptodate = 0;
+
+		/*
+		 * set_page_dirty dirties all buffers in a page, independent
+		 * of their state.  The dirty state however is entirely
+		 * meaningless for holes (!mapped && uptodate), so skip
+		 * buffers covering holes here.
+		 */
+		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+			imap_valid = 0;
+			continue;
+		}
+
+		/*
+		 * Classify the buffer; any change of I/O type invalidates the
+		 * cached mapping so that it is looked up again below.
+		 */
+		if (buffer_unwritten(bh)) {
+			if (type != XFS_IO_UNWRITTEN) {
+				type = XFS_IO_UNWRITTEN;
+				imap_valid = 0;
+			}
+		} else if (buffer_delay(bh)) {
+			if (type != XFS_IO_DELALLOC) {
+				type = XFS_IO_DELALLOC;
+				imap_valid = 0;
+			}
+		} else if (buffer_uptodate(bh)) {
+			if (type != XFS_IO_OVERWRITE) {
+				type = XFS_IO_OVERWRITE;
+				imap_valid = 0;
+			}
+		} else {
+			if (PageUptodate(page))
+				ASSERT(buffer_mapped(bh));
+			/*
+			 * This buffer is not uptodate and will not be
+			 * written to disk.  Ensure that we will put any
+			 * subsequent writeable buffers into a new
+			 * ioend.
+			 */
+			imap_valid = 0;
+			continue;
+		}
+
+		if (imap_valid)
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		if (!imap_valid) {
+			/*
+			 * If we didn't have a valid mapping then we need to
+			 * put the new mapping into a separate ioend structure.
+			 * This ensures non-contiguous extents always have
+			 * separate ioends, which is particularly important
+			 * for unwritten extent conversion at I/O completion
+			 * time.
+			 */
+			new_ioend = 1;
+			err = xfs_map_blocks(inode, offset, &imap, type,
+					     nonblocking);
+			if (err)
+				goto error;
+			imap_valid = xfs_imap_valid(inode, &imap, offset);
+		}
+		if (imap_valid) {
+			lock_buffer(bh);
+			if (type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+					 new_ioend);
+			count++;
+		}
+
+		/* Remember the first ioend; it heads the chain submitted below. */
+		if (!iohead)
+			iohead = ioend;
+
+	} while (offset += len, ((bh = bh->b_this_page) != head));
+
+	if (uptodate && bh == head)
+		SetPageUptodate(page);
+
+	xfs_start_page_writeback(page, 1, count);
+
+	/* if there is no IO to be submitted for this page, we are done */
+	if (!ioend)
+		return 0;
+
+	ASSERT(iohead);
+
+	/*
+	 * Any errors from this point onwards need to be reported through the IO
+	 * completion path as we have marked the initial page as under writeback
+	 * and unlocked it.
+	 */
+	if (imap_valid) {
+		/* NOTE: intentionally shadows the outer pgoff_t end_index. */
+		xfs_off_t		end_index;
+
+		end_index = imap.br_startoff + imap.br_blockcount;
+
+		/* to bytes */
+		end_index <<= inode->i_blkbits;
+
+		/* to pages */
+		end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+		/* check against file size */
+		if (end_index > last_index)
+			end_index = last_index;
+
+		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+				  wbc, end_index);
+	}
+
+
+	/*
+	 * Reserve log space if we might write beyond the on-disk inode size.
+	 */
+	err = 0;
+	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		err = xfs_setfilesize_trans_alloc(ioend);
+
+	/* A reservation failure is passed on to xfs_submit_ioend(). */
+	xfs_submit_ioend(wbc, iohead, err);
+
+	return 0;
+
+error:
+	if (iohead)
+		xfs_cancel_ioend(iohead);
+
+	/* -EAGAIN is not a hard failure: redirty the page and report success. */
+	if (err == -EAGAIN)
+		goto redirty;
+
+	xfs_aops_discard_page(page);
+	ClearPageUptodate(page);
+	unlock_page(page);
+	return err;
+
+redirty:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+/*
+ * Clear XFS_ITRUNCATED on the inode, then let generic_writepages() drive
+ * writeback of the mapping.
+ */
+STATIC int
+xfs_vm_writepages(
+	struct address_space	*mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+	struct page		*page,
+	gfp_t			gfp_mask)
+{
+	int			delalloc, unwritten;
+
+	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+
+	xfs_count_page_state(page, &delalloc, &unwritten);
+
+	/* Refuse (with a one-time warning) while delalloc or unwritten buffers remain. */
+	if (WARN_ON_ONCE(delalloc))
+		return 0;
+	if (WARN_ON_ONCE(unwritten))
+		return 0;
+
+	return try_to_free_buffers(page);
+}
+
+/*
+ * When we map a DIO buffer, we may need to attach an ioend that describes the
+ * type of write IO we are doing. This passes to the completion function the
+ * operations it needs to perform. If the mapping is for an overwrite wholly
+ * within the EOF then we don't need an ioend and so we don't allocate one.
+ * This avoids the unnecessary overhead of allocating and freeing ioends for
+ * workloads that don't require transactions on IO completion.
+ *
+ * If we get multiple mappings in a single IO, we might be mapping different
+ * types.
But because the direct IO can only have a single private pointer, we + * need to ensure that: + * + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and + * b) if it contains unwritten extents, it is *permanently* marked as such + * + * We could do this by chaining ioends like buffered IO does, but we only + * actually get one IO completion callback from the direct IO, and that spans + * the entire IO regardless of how many mappings and IOs are needed to complete + * the DIO. There is only going to be one reference to the ioend and its life + * cycle is constrained by the DIO completion code. hence we don't need + * reference counting here. + */ +static void +xfs_map_direct( + struct inode *inode, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + struct xfs_ioend *ioend; + xfs_off_t size = bh_result->b_size; + int type; + + if (ISUNWRITTEN(imap)) + type = XFS_IO_UNWRITTEN; + else + type = XFS_IO_OVERWRITE; + + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + + if (bh_result->b_private) { + ioend = bh_result->b_private; + ASSERT(ioend->io_size > 0); + ASSERT(offset >= ioend->io_offset); + if (offset + size > ioend->io_offset + ioend->io_size) + ioend->io_size = offset - ioend->io_offset + size; + + if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) + ioend->io_type = XFS_IO_UNWRITTEN; + + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, + ioend->io_size, ioend->io_type, + imap); + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_size = size; + + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); + + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, + imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + } +} + +/* + * If this 
is O_DIRECT or the mpage code calling tell them how large the mapping
+ * is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the mapping
+ * for blocks beyond EOF must be marked new so that sub block regions can be
+ * correctly zeroed. We can't do this for mappings within EOF unless the mapping
+ * was just allocated or is unwritten, otherwise the callers would overwrite
+ * existing data with zeros. Hence we have to split the mapping into a range up
+ * to and including EOF, and a second mapping for beyond EOF.
+ */
+static void
+xfs_map_trim_size(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t		offset,
+	ssize_t			size)
+{
+	xfs_off_t		mapping_size;
+
+	/* Bytes from @iblock to the end of the extent described by @imap. */
+	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
+	mapping_size <<= inode->i_blkbits;
+
+	ASSERT(mapping_size > 0);
+	if (mapping_size > size)
+		mapping_size = size;
+	if (offset < i_size_read(inode) &&
+	    offset + mapping_size >= i_size_read(inode)) {
+		/* limit mapping to block that spans EOF */
+		mapping_size = roundup_64(i_size_read(inode) - offset,
+					  1 << inode->i_blkbits);
+	}
+	if (mapping_size > LONG_MAX)
+		mapping_size = LONG_MAX;
+
+	bh_result->b_size = mapping_size;
+}
+
+/*
+ * Common block-mapping routine behind xfs_get_blocks() and
+ * xfs_get_blocks_direct().
+ *
+ * Looks up the extent covering file block @iblock and describes it in
+ * @bh_result. A non-zero @create permits allocating space for holes and
+ * delayed allocations; a non-zero @direct selects the direct I/O variant.
+ *
+ * Returns 0 on success (including the case where nothing was mapped),
+ * -EIO if the filesystem has been shut down, or the error reported by the
+ * mapping/allocation helpers.
+ */
+STATIC int
+__xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	int			error = 0;
+	int			lockmode = 0;
+	struct xfs_bmbt_irec	imap;
+	int			nimaps = 1;
+	xfs_off_t		offset;
+	ssize_t			size;
+	int			new = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	offset = (xfs_off_t)iblock << inode->i_blkbits;
+	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+	size = bh_result->b_size;
+
+	/* Direct reads at or beyond EOF have nothing to map. */
+	if (!create && direct && offset >= i_size_read(inode))
+		return 0;
+
+	/*
+	 * Direct I/O is usually done on preallocated files, so try getting
+	 * a block mapping without an exclusive lock first.  For buffered
+	 * writes we already have the exclusive iolock anyway, so avoiding
+	 * a lock roundtrip here by taking the ilock exclusive from the
+	 * beginning is a useful micro optimization.
+	 */
+	if (create && !direct) {
+		lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, lockmode);
+	} else {
+		lockmode = xfs_ilock_data_map_shared(ip);
+	}
+
+	/* Clamp the request so it never extends past s_maxbytes. */
+	ASSERT(offset <= mp->m_super->s_maxbytes);
+	if (offset + size > mp->m_super->s_maxbytes)
+		size = mp->m_super->s_maxbytes - offset;
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (error)
+		goto out_unlock;
+
+	if (create &&
+	    (!nimaps ||
+	     (imap.br_startblock == HOLESTARTBLOCK ||
+	      imap.br_startblock == DELAYSTARTBLOCK))) {
+		if (direct || xfs_get_extsz_hint(ip)) {
+			/*
+			 * Drop the ilock in preparation for starting the block
+			 * allocation transaction.  It will be retaken
+			 * exclusively inside xfs_iomap_write_direct for the
+			 * actual allocation.
+			 */
+			xfs_iunlock(ip, lockmode);
+			error = xfs_iomap_write_direct(ip, offset, size,
+						       &imap, nimaps);
+			/* ilock was dropped above, so return directly */
+			if (error)
+				return error;
+			new = 1;
+		} else {
+			/*
+			 * Delalloc reservations do not require a transaction,
+			 * we can go on without dropping the lock here. If we
+			 * are allocating a new delalloc block, make sure that
+			 * we set the new flag so that we mark the buffer new so
+			 * that we know that it is newly allocated if the write
+			 * fails.
+			 */
+			if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+				new = 1;
+			error = xfs_iomap_write_delay(ip, offset, size, &imap);
+			if (error)
+				goto out_unlock;
+
+			xfs_iunlock(ip, lockmode);
+		}
+		trace_xfs_get_blocks_alloc(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_DELALLOC, &imap);
+	} else if (nimaps) {
+		trace_xfs_get_blocks_found(ip, offset, size,
+				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
+						   : XFS_IO_OVERWRITE, &imap);
+		xfs_iunlock(ip, lockmode);
+	} else {
+		/* nothing mapped: error is still 0 here */
+		trace_xfs_get_blocks_notfound(ip, offset, size);
+		goto out_unlock;
+	}
+
+	/* trim mapping down to size requested */
+	if (direct || size > (1 << inode->i_blkbits))
+		xfs_map_trim_size(inode, iblock, bh_result,
+				  &imap, offset, size);
+
+	/*
+	 * For unwritten extents do not report a disk address in the buffered
+	 * read case (treat as if we're reading into a hole).
+	 */
+	if (imap.br_startblock != HOLESTARTBLOCK &&
+	    imap.br_startblock != DELAYSTARTBLOCK &&
+	    (create || !ISUNWRITTEN(&imap))) {
+		xfs_map_buffer(inode, bh_result, &imap, offset);
+		if (ISUNWRITTEN(&imap))
+			set_buffer_unwritten(bh_result);
+		/* direct IO needs special help */
+		if (create && direct)
+			xfs_map_direct(inode, bh_result, &imap, offset);
+	}
+
+	/*
+	 * If this is a realtime file, data may be on a different device.
+	 * to that pointed to from the buffer_head b_bdev currently.
+	 */
+	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+	/*
+	 * If we previously allocated a block out beyond eof and we are now
+	 * coming back to use it then we will need to flag it as new even if it
+	 * has a disk address.
+	 *
+	 * With sub-block writes into unwritten extents we also need to mark
+	 * the buffer as new so that the unwritten parts of the buffer gets
+	 * correctly zeroed.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= i_size_read(inode)) ||
+	     (new || ISUNWRITTEN(&imap))))
+		set_buffer_new(bh_result);
+
+	if (imap.br_startblock == DELAYSTARTBLOCK) {
+		/* a delalloc mapping must never be seen by direct I/O */
+		BUG_ON(direct);
+		if (create) {
+			set_buffer_uptodate(bh_result);
+			set_buffer_mapped(bh_result);
+			set_buffer_delay(bh_result);
+		}
+	}
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, lockmode);
+	return error;
+}
+
+/* Buffered I/O entry point: __xfs_get_blocks() with direct == 0. */
+int
+xfs_get_blocks(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+/* Direct I/O entry point: __xfs_get_blocks() with direct == 1. */
+STATIC int
+xfs_get_blocks_direct(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(ip, offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	/* On shutdown skip the size updates but still run ioend completion. */
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto out_end_io;
+
+	/*
+	 * dio completion end_io functions are only called on writes if more
+	 * than 0 bytes was written.
+	 */
+	ASSERT(size > 0);
+
+	/*
+	 * The ioend only maps whole blocks, while the IO may be sector aligned.
+	 * Hence the ioend offset/size may not match the IO offset/size exactly.
+	 * Because we don't map overwrites within EOF into the ioend, the offset
+	 * may not match, but only if the endio spans EOF.  Either way, write
+	 * the IO sizes into the ioend so that completion processing does the
+	 * right thing.
+	 */
+	ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
+	ioend->io_size = size;
+	ioend->io_offset = offset;
+
+	/*
+	 * The ioend tells us whether we are doing unwritten extent conversion
+	 * or an append transaction that updates the on-disk file size. These
+	 * cases are the only cases where we should *potentially* be needing
+	 * to update the VFS inode size.
+	 *
+	 * We need to update the in-core inode size here so that we don't end up
+	 * with the on-disk inode size being outside the in-core inode size. We
+	 * have no other method of updating EOF for AIO, so always do it here
+	 * if necessary.
+	 *
+	 * We need to lock the test/set EOF update as we can be racing with
+	 * other IO completions here to update the EOF. Failing to serialise
+	 * here can result in EOF moving backwards and Bad Things Happen when
+	 * that occurs.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (offset + size > i_size_read(inode))
+		i_size_write(inode, offset + size);
+	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * If we are doing an append IO that needs to update the EOF on disk,
+	 * do the transaction reserve now so we can use common end io
+	 * processing. Stashing the error (if there is one) in the ioend will
+	 * result in the ioend processing passing on the error if it is
+	 * possible as we can't return it from here.
+	 */
+	if (ioend->io_type == XFS_IO_OVERWRITE)
+		ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
+
+out_end_io:
+	xfs_end_io(&ioend->io_work);
+	return;
+}
+
+/*
+ * Direct I/O entry point. Writes are issued with the
+ * xfs_end_io_direct_write() completion handler and DIO_ASYNC_EXTEND;
+ * reads are issued with no completion handler and no flags.
+ */
+STATIC ssize_t
+xfs_vm_direct_IO(
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset)
+{
+	struct inode		*inode = iocb->ki_filp->f_mapping->host;
+	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
+
+	if (iov_iter_rw(iter) == WRITE) {
+		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+					    xfs_get_blocks_direct,
+					    xfs_end_io_direct_write, NULL,
+					    DIO_ASYNC_EXTEND);
+	}
+	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				    xfs_get_blocks_direct, NULL, NULL, 0);
+}
+
+/*
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have made it to disk yet
+ * as the page is still locked at this point.
+ */
+STATIC void
+xfs_vm_kill_delalloc_range(
+	struct inode		*inode,
+	loff_t			start,
+	loff_t			end)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	xfs_fileoff_t		start_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	/* Convert the byte range to filesystem blocks; nothing to do if empty. */
+	start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
+	if (end_fsb <= start_fsb)
+		return;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+						end_fsb - start_fsb);
+	if (error) {
+		/* something screwed, just bail */
+		if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			xfs_alert(ip->i_mount,
+		"xfs_vm_write_failed: unable to clean up ino %lld",
+					ip->i_ino);
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+/*
+ * Clean up after a failed write of @len bytes at @pos into @page: for each
+ * delalloc buffer overlapping the write that is either new or lies at or
+ * beyond EOF, punch out its delalloc blocks and reset the buffer state.
+ */
+STATIC void
+xfs_vm_write_failed(
+	struct inode		*inode,
+	struct page		*page,
+	loff_t			pos,
+	unsigned		len)
+{
+	loff_t			block_offset;
+	loff_t			block_start;
+	loff_t			block_end;
+	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t			to = from + len;
+	struct buffer_head	*bh, *head;
+
+	/*
+	 * The request pos offset might be 32 or 64 bit, this is all fine
+	 * on 64-bit platform.  However, for 64-bit pos request on 32-bit
+	 * platform, the high 32-bit will be masked off if we evaluate the
+	 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
+	 * 0xfffff000 as an unsigned long, hence the result is incorrect
+	 * which could cause the following ASSERT failed in most cases.
+	 * In order to avoid this, we can evaluate the block_offset of the
+	 * start of the page by using shifts rather than masks the mismatch
+	 * problem.
+	 */
+	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+
+	ASSERT(block_offset + from == pos);
+
+	/*
+	 * block_start/block_end are offsets within the page, block_offset is
+	 * the matching file offset. The "|| !block_start" term lets the loop
+	 * body run for the first buffer even though bh == head at that point.
+	 */
+	head = page_buffers(page);
+	block_start = 0;
+	for (bh = head; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start = block_end,
+				   block_offset += bh->b_size) {
+		block_end = block_start + bh->b_size;
+
+		/* skip buffers before the write */
+		if (block_end <= from)
+			continue;
+
+		/* if the buffer is after the write, we're done */
+		if (block_start >= to)
+			break;
+
+		if (!buffer_delay(bh))
+			continue;
+
+		/* leave pre-existing delalloc buffers inside EOF alone */
+		if (!buffer_new(bh) && block_offset < i_size_read(inode))
+			continue;
+
+		xfs_vm_kill_delalloc_range(inode, block_offset,
+					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
+	}
+
+}
+
+/*
+ * This used to call block_write_begin(), but it unlocks and releases the page
+ * on error, and we need that page to be able to punch stale delalloc blocks out
+ * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
+ * the appropriate point.
+ */
+STATIC int
+xfs_vm_write_begin(
+	struct file		*file,
+	struct address_space	*mapping,
+	loff_t			pos,
+	unsigned		len,
+	unsigned		flags,
+	struct page		**pagep,
+	void			**fsdata)
+{
+	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
+	struct page		*page;
+	int			status;
+
+	ASSERT(len <= PAGE_CACHE_SIZE);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		return -ENOMEM;
+
+	status = __block_write_begin(page, pos, len, xfs_get_blocks);
+	if (unlikely(status)) {
+		struct inode	*inode = mapping->host;
+		/*
+		 * Keep the inode size in a 64-bit loff_t. Holding it in a
+		 * size_t truncates it where size_t is 32 bits wide, which
+		 * makes the EOF test below misfire and hands a wrong range
+		 * to truncate_pagecache_range() for files larger than 4GiB.
+		 */
+		loff_t		isize = i_size_read(inode);
+
+		/* Punch stale delalloc blocks while we still hold the page. */
+		xfs_vm_write_failed(inode, page, pos, len);
+		unlock_page(page);
+
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			loff_t	start = max_t(loff_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
+
+		page_cache_release(page);
+		page = NULL;
+	}
+
+	/* On failure *pagep is NULL and the error is returned. */
+	*pagep = page;
+	return status;
+}
+
+/*
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
+ */ +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ASSERT(len <= PAGE_CACHE_SIZE); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; + xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); + } + } + return ret; +} + +STATIC sector_t +xfs_vm_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_vm_bmap(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + filemap_write_and_wait(mapping); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return generic_block_bmap(mapping, block, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); +} + +/* + * This is basically a copy of __set_page_dirty_buffers() with one + * small tweak: buffers beyond EOF do not get marked dirty. If we mark them + * dirty, we'll never be able to clean them because we don't write buffers + * beyond EOF, and that means we can't invalidate pages that span EOF + * that have been marked dirty. Further, the dirty state can leak into + * the file interior if the file is extended, resulting in all sorts of + * bad things happening as the state does not match the underlying data. + * + * XXX: this really indicates that bufferheads in XFS need to die. 
Warts like + * this only exist because of bufferheads and how the generic code manages them. + */ +STATIC int +xfs_vm_set_page_dirty( + struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + loff_t end_offset; + loff_t offset; + int newly_dirty; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + end_offset = i_size_read(inode); + offset = page_offset(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (offset < end_offset) + set_buffer_dirty(bh); + bh = bh->b_this_page; + offset += 1 << inode->i_blkbits; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) { + /* sigh - __set_page_dirty() is static, so copy it here, too */ + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (page->mapping) { /* Race with truncate? 
*/ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return newly_dirty; +} + +const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, + .set_page_dirty = xfs_vm_set_page_dirty, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, + .write_end = xfs_vm_write_end, + .bmap = xfs_vm_bmap, + .direct_IO = xfs_vm_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/kernel/fs/xfs/xfs_aops.h b/kernel/fs/xfs/xfs_aops.h new file mode 100644 index 000000000..ac644e013 --- /dev/null +++ b/kernel/fs/xfs/xfs_aops.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2005-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_AOPS_H__ +#define __XFS_AOPS_H__ + +extern mempool_t *xfs_ioend_pool; + +/* + * Types of I/O for bmap clustering and I/O completion tracking. 
+ */ +enum { + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } + +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. + */ +typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ + int io_error; /* I/O error code */ + atomic_t io_remaining; /* hold count */ + struct inode *io_inode; /* file being written to */ + struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ + size_t io_size; /* size of the extent */ + xfs_off_t io_offset; /* offset in the file */ + struct work_struct io_work; /* xfsdatad work queue */ + struct xfs_trans *io_append_trans;/* xact. for size update */ +} xfs_ioend_t; + +extern const struct address_space_operations xfs_address_space_operations; +extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +extern void xfs_count_page_state(struct page *, int *, int *); + +#endif /* __XFS_AOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_attr.h b/kernel/fs/xfs/xfs_attr.h new file mode 100644 index 000000000..dd4824589 --- /dev/null +++ b/kernel/fs/xfs/xfs_attr.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ATTR_H__ +#define __XFS_ATTR_H__ + +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + +/* + * Large attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. + * The internal links in the Btree are logical block offsets into the file. + * + * Small attribute lists use a different format and are packed as tightly + * as possible so as to fit into the literal area of the inode. + */ + +/*======================================================================== + * External interfaces + *========================================================================*/ + + +#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ +#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ +#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ +#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ + +#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ +#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ + +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } + +/* + * The maximum size (into the kernel or 
returned from the kernel) of an + * attribute value or the buffer used for an attr_list() call. Larger + * sizes will result in an ERANGE return code. + */ +#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ + +/* + * Define how lists of attribute names are returned to the user from + * the attr_list() call. A large, 32bit aligned, buffer is passed in + * along with its size. We put an array of offsets at the top that each + * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. + */ +typedef struct attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +} attrlist_t; + +/* + * Show the interesting info about one attribute. This is what the + * al_offset[i] entry points to. + */ +typedef struct attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +} attrlist_ent_t; + +/* + * Given a pointer to the (char*) buffer containing the attr_list() result, + * and an index, return a pointer to the indicated attribute in the buffer. + */ +#define ATTR_ENTRY(buffer, index) \ + ((attrlist_ent_t *) \ + &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) + +/* + * Kernel-internal version of the attrlist cursor. + */ +typedef struct attrlist_cursor_kern { + __u32 hashval; /* hash value of next entry to add */ + __u32 blkno; /* block containing entry (suggestion) */ + __u32 offset; /* offset in list of equal-hashvals */ + __u16 pad1; /* padding to match user-level */ + __u8 pad2; /* padding to match user-level */ + __u8 initted; /* T/F: cursor has been initialized */ +} attrlist_cursor_kern_t; + + +/*======================================================================== + * Structure used to pass context around among the routines. 
+ *========================================================================*/ + + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int, unsigned char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ + +/* + * Overall external interface routines. + */ +int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); + + +#endif /* __XFS_ATTR_H__ */ diff --git a/kernel/fs/xfs/xfs_attr_inactive.c b/kernel/fs/xfs/xfs_attr_inactive.c new file mode 100644 index 000000000..3fbf167cf --- /dev/null +++ b/kernel/fs/xfs/xfs_attr_inactive.c @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. 
+ * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_attr_remote.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_dir2.h" + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) +{ + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. 
+ */ + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) { + return error; + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, 0); + if (!bp) + return -ENOMEM; + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return 0; +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; + struct xfs_mount *mp = bp->b_target->bt_mount; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); + + /* + * Count the number of "remote" value extents. 
+ */ + count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_trans_brelse(*trans, bp); + return 0; + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr3_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free(list); + return error; +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr3_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, i; + struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; + + /* + * Since this code is recursive (gasp!) 
we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + return -EIO; + } + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { + xfs_trans_brelse(*trans, bp); + return 0; + } + btree = dp->d_ops->node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < ichdr.count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return error; + if (child_bp) { + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, child_bp); + break; + } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache + * and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. 
+ */ + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + return 0; +} + +/* + * Indiscriminately delete the entire attribute fork + * + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return error; + blkno = bp->b_bn; + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: + error = -EIO; + xfs_trans_brelse(*trans, bp); + break; + } + if (error) + return error; + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. 
+ */ + error = xfs_trans_roll(trans, dp); + + return error; +} + +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ +int +xfs_attr_inactive( + struct xfs_inode *dp) +{ + struct xfs_trans *trans; + struct xfs_mount *mp; + int cancel_flags = 0; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + lock_mode = 0; + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. 
+ */ + xfs_trans_ijoin(trans, dp, 0); + + /* invalidate and truncate the attribute fork extents */ + if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; + } + + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); + + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, lock_mode); + return error; + +out_cancel: + xfs_trans_cancel(trans, cancel_flags); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); + return error; +} diff --git a/kernel/fs/xfs/xfs_attr_list.c b/kernel/fs/xfs/xfs_attr_list.c new file mode 100644 index 000000000..65fb37a18 --- /dev/null +++ b/kernel/fs/xfs/xfs_attr_list.c @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dir2.h" + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return -1; + } else if (sa->hash > sb->hash) { + return 1; + } else { + return sa->entno - sb->entno; + } +} + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. 
+ */ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return 0; + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return 0; + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. 
+ */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return -EFSCORRUPTED; + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return 0; + } + + /* + * Loop putting entries into the user buffer. 
+ */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return 0; +} + +STATIC int +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int error, i; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; + + trace_xfs_attr_node_list(context); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. 
+ */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != -EFSCORRUPTED)) + return error; + if (bp) { + struct xfs_attr_leaf_entry *entries; + + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. 
+ */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + __uint16_t magic; + + error = xfs_da3_node_read(NULL, dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return error; + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_trans_brelse(NULL, bp); + return -EFSCORRUPTED; + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == nodehdr.count) { + xfs_trans_brelse(NULL, bp); + return 0; + } + xfs_trans_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->b_addr; + error = xfs_attr3_leaf_list_int(bp, context); + if (error) { + xfs_trans_brelse(NULL, bp); + return error; + } + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) + break; + cursor->blkno = leafhdr.forw; + xfs_trans_brelse(NULL, bp); + error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + if (error) + return error; + } + xfs_trans_brelse(NULL, bp); + return 0; +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. 
+ */
+/*
+ * Walk the entries of the attr leaf block held in @bp and pass every
+ * complete (non-INCOMPLETE) entry to context->put_listent().  If
+ * context->resynch is set the walk first re-finds the cursor position,
+ * otherwise it starts at the first entry.  When context->put_value is set,
+ * remote values are read into a temporary buffer that is freed on every
+ * path, including a failed remote read.
+ *
+ * Returns 0, or the first non-zero value returned by xfs_attr_rmtval_get()
+ * or by the put_listent callback.
+ */
+int
+xfs_attr3_leaf_list_int(
+	struct xfs_buf			*bp,
+	struct xfs_attr_list_context	*context)
+{
+	struct attrlist_cursor_kern	*cursor;
+	struct xfs_attr_leafblock	*leaf;
+	struct xfs_attr3_icleaf_hdr	ichdr;
+	struct xfs_attr_leaf_entry	*entries;
+	struct xfs_attr_leaf_entry	*entry;
+	int				retval;
+	int				i;
+	struct xfs_mount		*mp = context->dp->i_mount;
+
+	trace_xfs_attr_list_leaf(context);
+
+	leaf = bp->b_addr;
+	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
+	entries = xfs_attr3_leaf_entryp(leaf);
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Re-find our place in the leaf block if this is a new syscall.
+	 */
+	if (context->resynch) {
+		entry = &entries[0];
+		for (i = 0; i < ichdr.count; entry++, i++) {
+			if (be32_to_cpu(entry->hashval) == cursor->hashval) {
+				if (cursor->offset == context->dupcnt) {
+					context->dupcnt = 0;
+					break;
+				}
+				context->dupcnt++;
+			} else if (be32_to_cpu(entry->hashval) >
+					cursor->hashval) {
+				context->dupcnt = 0;
+				break;
+			}
+		}
+		if (i == ichdr.count) {
+			trace_xfs_attr_list_notfound(context);
+			return 0;
+		}
+	} else {
+		entry = &entries[0];
+		i = 0;
+	}
+	context->resynch = 0;
+
+	/*
+	 * We have found our place, start copying out the new attributes.
+	 */
+	retval = 0;
+	for (; i < ichdr.count; entry++, i++) {
+		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
+			cursor->hashval = be32_to_cpu(entry->hashval);
+			cursor->offset = 0;
+		}
+
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* skip incomplete entries */
+
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			xfs_attr_leaf_name_local_t *name_loc =
+				xfs_attr3_leaf_name_local(leaf, i);
+
+			retval = context->put_listent(context,
+						entry->flags,
+						name_loc->nameval,
+						(int)name_loc->namelen,
+						be16_to_cpu(name_loc->valuelen),
+						&name_loc->nameval[name_loc->namelen]);
+			if (retval)
+				return retval;
+		} else {
+			xfs_attr_leaf_name_remote_t *name_rmt =
+				xfs_attr3_leaf_name_remote(leaf, i);
+
+			int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+			if (context->put_value) {
+				xfs_da_args_t args;
+
+				memset((char *)&args, 0, sizeof(args));
+				args.geo = context->dp->i_mount->m_attr_geo;
+				args.dp = context->dp;
+				args.whichfork = XFS_ATTR_FORK;
+				args.valuelen = valuelen;
+				args.rmtvaluelen = valuelen;
+				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
+				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+				args.rmtblkcnt = xfs_attr3_rmt_blocks(
+							args.dp->i_mount, valuelen);
+				retval = xfs_attr_rmtval_get(&args);
+				/*
+				 * Only hand the value to the callback if the
+				 * read worked, but always free the temporary
+				 * buffer so a failed read does not leak it.
+				 */
+				if (!retval)
+					retval = context->put_listent(context,
+							entry->flags,
+							name_rmt->name,
+							(int)name_rmt->namelen,
+							valuelen,
+							args.value);
+				kmem_free(args.value);
+			} else {
+				retval = context->put_listent(context,
+						entry->flags,
+						name_rmt->name,
+						(int)name_rmt->namelen,
+						valuelen,
+						NULL);
+			}
+			if (retval)
+				return retval;
+		}
+		if (context->seen_enough)
+			break;
+		cursor->offset++;
+	}
+	trace_xfs_attr_list_leaf_end(context);
+	return retval;
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+	struct xfs_buf	*leaf_bp;
+	int		ret;
+
+	trace_xfs_attr_leaf_list(context);
+
+	/* single-leaf attr fork: the leaf is always block 0 */
+	context->cursor->blkno = 0;
+	ret = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &leaf_bp);
+	if (!ret) {
+		ret = xfs_attr3_leaf_list_int(leaf_bp, context);
+		xfs_trans_brelse(NULL, leaf_bp);
+	}
+	return ret;
+}
+
+/*
+ * Pick the listing routine matching the attr fork format (none, shortform,
+ * single leaf, or node) and run it with the attr map lock held shared.
+ * Returns -EIO on a shut down filesystem, otherwise the routine's result.
+ */
+int
+xfs_attr_list_int(
+	xfs_attr_list_context_t *context)
+{
+	xfs_inode_t	*dp = context->dp;
+	uint		lock_mode;
+	int		error = 0;
+
+	XFS_STATS_INC(xs_attr_list);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return -EIO;
+
+	lock_mode = xfs_ilock_attr_map_shared(dp);
+	if (xfs_inode_hasattr(dp)) {
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+			error = xfs_attr_shortform_list(context);
+		else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+			error = xfs_attr_leaf_list(context);
+		else
+			error = xfs_attr_node_list(context);
+	}
+	xfs_iunlock(dp, lock_mode);
+	return error;
+}
+
+#define	ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define	ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+STATIC int
+xfs_attr_put_listent(
+	xfs_attr_list_context_t *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	struct attrlist *alist = (struct attrlist *)context->alist;
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*alist));
+	ASSERT(context->firstu <= context->bufsize);
+
+	/*
+	 * Only list entries in the right namespace.
+	 */
+	if (((context->flags & ATTR_SECURE) == 0) !=
+	    ((flags & XFS_ATTR_SECURE) == 0))
+		return 0;
+	if (((context->flags & ATTR_ROOT) == 0) !=
+	    ((flags & XFS_ATTR_ROOT) == 0))
+		return 0;
+
+	/*
+	 * The offset array grows up from the list header while the entries
+	 * grow down from the end of the buffer; the buffer is full once the
+	 * next entry would run into the offset array.
+	 */
+	arraytop = sizeof(*alist) +
+			context->count * sizeof(alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		trace_xfs_attr_list_full(context);
+		alist->al_more = 1;
+		context->seen_enough = 1;
+		/*
+		 * A full buffer is not an error.  Callers stop on
+		 * seen_enough; returning a positive value here would be
+		 * passed up as a bogus error code by xfs_attr_list().
+		 */
+		return 0;
+	}
+
+	aep = (attrlist_ent_t *)&context->alist[context->firstu];
+	aep->a_valuelen = valuelen;
+	memcpy(aep->a_name, name, namelen);
+	aep->a_name[namelen] = 0;
+	alist->al_offset[context->count++] = context->firstu;
+	alist->al_count = context->count;
+	trace_xfs_attr_list_add(context);
+	return 0;
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.  The list is built in the caller's buffer as a
+ * struct attrlist; al_more is set there when not everything fit.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL for a bad cursor,
+ * -EFAULT for a misaligned buffer, or whatever xfs_attr_list_int()
+ * reports).
+ */
+int
+xfs_attr_list(
+	xfs_inode_t	*dp,
+	char		*buffer,
+	int		bufsize,
+	int		flags,
+	attrlist_cursor_kern_t *cursor)
+{
+	xfs_attr_list_context_t context;
+	struct attrlist *alist;
+	int error;
+
+	/*
+	 * Validate the cursor.
+	 */
+	if (cursor->pad1 || cursor->pad2)
+		return -EINVAL;
+	if ((cursor->initted == 0) &&
+	    (cursor->hashval || cursor->blkno || cursor->offset))
+		return -EINVAL;
+
+	/*
+	 * Check for a properly aligned buffer.
+	 */
+	if (((long)buffer) & (sizeof(int)-1))
+		return -EFAULT;
+	if (flags & ATTR_KERNOVAL)
+		bufsize = 0;
+
+	/*
+	 * Initialize the output buffer.
+	 */
+	memset(&context, 0, sizeof(context));
+	context.dp = dp;
+	context.cursor = cursor;
+	context.resynch = 1;
+	context.flags = flags;
+	context.alist = buffer;
+	context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
+	context.firstu = context.bufsize;
+	context.put_listent = xfs_attr_put_listent;
+
+	alist = (struct attrlist *)context.alist;
+	alist->al_count = 0;
+	alist->al_more = 0;
+	alist->al_offset[0] = context.bufsize;
+
+	error = xfs_attr_list_int(&context);
+	ASSERT(error <= 0);
+	return error;
+}
diff --git a/kernel/fs/xfs/xfs_bit.c b/kernel/fs/xfs/xfs_bit.c
new file mode 100644
index 000000000..0e8885a59
--- /dev/null
+++ b/kernel/fs/xfs/xfs_bit.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_log_format.h"
+#include "xfs_bit.h"
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+/*
+ * Return whether bitmap is empty.
+ * Size is number of words in the bitmap, which is padded to word boundary
+ * Returns 1 for empty, 0 for non-empty.
+ */
+int
+xfs_bitmap_empty(uint *map, uint size)
+{
+	uint word;
+
+	/* any non-zero word means at least one bit is set */
+	for (word = 0; word < size; word++) {
+		if (map[word] != 0)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+	uint * word = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint nbits = size << BIT_TO_WORD_SHIFT;	/* bitmap size in bits */
+	uint counted = 0;
+	uint bits;
+
+	ASSERT(start_bit < nbits);
+	nbits -= start_bit & ~(NBWORD - 1);
+	start_bit &= (NBWORD - 1);
+
+	if (start_bit) {
+		/* set to one first offset bits prior to start */
+		bits = *word++ | (~0U >> (NBWORD-start_bit));
+		if (bits != ~0U)
+			return counted + ffz(bits) - start_bit;
+		counted += NBWORD;
+		nbits -= NBWORD;
+	}
+
+	/* whole words: stop at the first one containing a zero bit */
+	for (; nbits; nbits -= NBWORD, counted += NBWORD) {
+		bits = *word++;
+		if (bits != ~0U)
+			return counted + ffz(bits) - start_bit;
+	}
+
+	return counted - start_bit;
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.  It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+	uint * word = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint base = start_bit & ~(NBWORD - 1);	/* bit number of *word's bit 0 */
+	uint nbits = size << BIT_TO_WORD_SHIFT;	/* bitmap size in bits */
+	uint bits;
+
+	if (start_bit >= nbits)
+		return -1;
+
+	nbits -= base;
+	start_bit &= (NBWORD - 1);
+
+	if (start_bit) {
+		/* set to zero first offset bits prior to start */
+		bits = *word++ & (~0U << start_bit);
+		if (bits != 0U)
+			return base + ffs(bits) - 1;
+		base += NBWORD;
+		nbits -= NBWORD;
+	}
+
+	/* whole words: stop at the first one with any bit set */
+	for (; nbits; nbits -= NBWORD, base += NBWORD) {
+		bits = *word++;
+		if (bits != 0U)
+			return base + ffs(bits) - 1;
+	}
+
+	return -1;
+}
diff --git a/kernel/fs/xfs/xfs_bmap_util.c b/kernel/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 000000000..a52bbd3ab
--- /dev/null
+++ b/kernel/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,1920 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2012 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
+
+/* Kernel only BMAP related definitions and functions */
+
+/*
+ * Convert the given file system block to a disk block.  We have to treat it
+ * differently based on whether the file is a real time file or not, because the
+ * bmap code does.
+ */
+xfs_daddr_t
+xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
+{
+	/* realtime inodes use the plain block conversion */
+	if (XFS_IS_REALTIME_INODE(ip))
+		return (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb));
+
+	return XFS_FSB_TO_DADDR((ip)->i_mount, (fsb));
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.  We never free any extents in
+ * the first transaction.
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent item */
+	struct xfs_trans_res	tres;		/* new log reservation */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	if (flist->xbf_count == 0) {
+		*committed = 0;		/* nothing queued; *tp is left as is */
+		return 0;
+	}
+	ntp = *tp;
+	/* log an EFI covering every extent on the free list */
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+
+	/* carry the current log reservation over to the new transaction */
+	tres.tr_logres = ntp->t_log_res;
+	tres.tr_logcount = ntp->t_log_count;
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0);
+	*tp = ntp;	/* caller continues with the duplicate from here on */
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error)
+		return error;
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(ntp->t_ticket);
+
+	error = xfs_trans_reserve(ntp, &tres, 0, 0);
+	if (error)
+		return error;
+	/* free each extent and record it in the EFD paired with the EFI */
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;	/* saved: free is deleted below */
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == -EFSCORRUPTED) ?
+						   SHUTDOWN_CORRUPT_INCORE :
+						   SHUTDOWN_META_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+/*
+ * Allocate space for a realtime file.  The request in @ap is aligned to
+ * the extent size hint, converted to realtime extents, allocated through
+ * xfs_rtallocate_extent() and converted back to filesystem blocks.
+ * On success ap->blkno and ap->length describe the allocation; ap->length
+ * is set to 0 when nothing could be allocated.  Returns 0 or an error
+ * from one of the helpers called.
+ */
+int
+xfs_bmap_rtalloc(
+	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
+{
+	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
+	int		error;		/* error return value */
+	xfs_mount_t	*mp;		/* mount point structure */
+	xfs_extlen_t	prod = 0;	/* product factor for allocators */
+	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
+	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_rtblock_t	rtb;
+
+	mp = ap->ip->i_mount;
+	align = xfs_get_extsz_hint(ap->ip);
+	prod = align / mp->m_sb.sb_rextsize;
+	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+					align, 1, ap->eof, 0,
+					ap->conv, &ap->offset, &ap->length);
+	if (error)
+		return error;
+	ASSERT(ap->length);
+	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
+
+	/*
+	 * If the offset & length are not perfectly aligned
+	 * then kill prod, it will just get us in trouble.
+	 */
+	if (do_mod(ap->offset, align) || ap->length % align)
+		prod = 1;
+	/*
+	 * Set ralen to be the actual requested length in rtextents.
+	 */
+	ralen = ap->length / mp->m_sb.sb_rextsize;
+	/*
+	 * If the old value was close enough to MAXEXTLEN that
+	 * we rounded up to it, cut it back so it's valid again.
+	 * Note that if it's a really large request (bigger than
+	 * MAXEXTLEN), we don't hear about that number, and can't
+	 * adjust the starting point to match it.
+	 */
+	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Lock out other modifications to the RT bitmap inode.
+	 * The inode is joined to the transaction with its lock flag.
+	 */
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If it's an allocation to an empty file at offset 0,
+	 * pick an extent that will space things out in the rt area.
+	 */
+	if (ap->eof && ap->offset == 0) {
+		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+
+		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+		if (error)
+			return error;
+		ap->blkno = rtx * mp->m_sb.sb_rextsize;
+	} else {
+		ap->blkno = 0;
+	}
+
+	xfs_bmap_adjacent(ap);
+
+	/*
+	 * Realtime allocation, done through xfs_rtallocate_extent.
+	 * If the first attempt finds nothing and prod > 1, it is retried
+	 * once with a product factor of 1.
+	 */
+	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+	do_div(ap->blkno, mp->m_sb.sb_rextsize);
+	rtb = ap->blkno;
+	ap->length = ralen;
+	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+				&ralen, atype, ap->wasdel, prod, &rtb)))
+		return error;
+	if (rtb == NULLFSBLOCK && prod > 1 &&
+	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
+					   ap->length, &ralen, atype,
+					   ap->wasdel, 1, &rtb)))
+		return error;
+	ap->blkno = rtb;
+	if (ap->blkno != NULLFSBLOCK) {
+		/* convert the result from rt extents back to fs blocks */
+		ap->blkno *= mp->m_sb.sb_rextsize;
+		ralen *= mp->m_sb.sb_rextsize;
+		ap->length = ralen;
+		ap->ip->i_d.di_nblocks += ralen;
+		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+		if (ap->wasdel)
+			ap->ip->i_delayed_blks -= ralen;
+		/*
+		 * Adjust the disk quota also. This was reserved
+		 * earlier.
+		 */
+		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
+					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+	} else {
+		ap->length = 0;
+	}
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.  All offsets are considered outside
+ * the end of file for an empty fork, so 1 is returned in *eof in that case.
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof)
+{
+	struct xfs_bmbt_irec	last;
+	int			error;
+
+	error = xfs_bmap_last_extent(NULL, ip, whichfork, &last, eof);
+	if (error)
+		return error;
+
+	/* xfs_bmap_last_extent() already flagged eof; nothing to compare */
+	if (*eof)
+		return 0;
+
+	*eof = endoff >= last.br_startoff + last.br_blockcount;
+	return 0;
+}
+
+/*
+ * Extent tree block counting routines.
+ */
+
+/*
+ * Count leaf blocks given a range of extent records.
+ */
+STATIC void
+xfs_bmap_count_leaves(
+	xfs_ifork_t		*ifp,
+	xfs_extnum_t		idx,
+	int			numrecs,
+	int			*count)
+{
+	int		n = 0;
+
+	/* add the block count of records idx .. idx + numrecs - 1 */
+	while (n < numrecs) {
+		xfs_bmbt_rec_host_t *rec = xfs_iext_get_ext(ifp, idx + n);
+
+		*count += xfs_bmbt_get_blockcount(rec);
+		n++;
+	}
+}
+
+/*
+ * Count leaf blocks given a range of extent records originally
+ * in btree format.
+ */
+STATIC void
+xfs_bmap_disk_count_leaves(
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	int			numrecs,
+	int			*count)
+{
+	xfs_bmbt_rec_t	*rec;
+	int		recno = 1;	/* on-disk record numbers start at 1 */
+
+	while (recno <= numrecs) {
+		rec = XFS_BMBT_REC_ADDR(mp, block, recno);
+		*count += xfs_bmbt_disk_get_blockcount(rec);
+		recno++;
+	}
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks in use.
+ */
+/*
+ * Each btree block read adds one to *count, and every leaf record adds its
+ * block count.  @levelin is the level of the block at @blockno.  A read
+ * failure is returned as is; any failure from the recursive descent is
+ * reported as -EFSCORRUPTED.  Every buffer read here is released again
+ * before returning, on error paths as well.
+ */
+STATIC int					/* error */
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ifork_t	*ifp,		/* inode fork pointer */
+	xfs_fsblock_t	blockno,	/* file system block number */
+	int		levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	__be64			*pp;
+	xfs_fsblock_t		bno = blockno;
+	xfs_fsblock_t		nextbno;
+	struct xfs_btree_block	*block, *nextblock;
+	int			numrecs;
+
+	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+	if (error)
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leaves, count this level of nodes */
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+		while (nextbno != NULLFSBLOCK) {
+			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error) {
+				/* bp is still held; don't leak it */
+				xfs_trans_brelse(tp, bp);
+				return error;
+			}
+			*count += 1;
+			nextblock = XFS_BUF_TO_BLOCK(nbp);
+			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
+		bno = be64_to_cpu(*pp);
+		if (unlikely((error =
+		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
+			xfs_trans_brelse(tp, bp);
+			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
+					 XFS_ERRLEVEL_LOW, mp);
+			return -EFSCORRUPTED;
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
+			numrecs = be16_to_cpu(block->bb_numrecs);
+			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
+			/* released before the next read, so no buffer is held on error */
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						&xfs_bmbt_buf_ops);
+			if (error)
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count fsblocks of the given fork.
+ *
+ * For an extents-format fork the in-core extent records are summed
+ * directly; otherwise the btree is walked from the first pointer in the
+ * incore root.  The result is added to *count, which the caller must have
+ * initialised.  Returns 0, or -EFSCORRUPTED if the tree walk fails.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	struct xfs_btree_block	*block;	/* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	__be64			*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		xfs_bmap_count_leaves(ifp, 0,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count);
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	level = be16_to_cpu(block->bb_level);
+	ASSERT(level > 0);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
+	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
+		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
+				 mp);
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
+/*
+ * returns 1 for success, 0 if we failed to map the extent.
+ */
+STATIC int
+xfs_getbmapx_fix_eof_hole(
+	xfs_inode_t		*ip,		/* xfs incore inode pointer */
+	struct getbmapx		*out,		/* output structure */
+	int			prealloced,	/* this is a file with
+						 * preallocated data space */
+	__int64_t		end,		/* last block requested */
+	xfs_fsblock_t		startblock)
+{
+	xfs_mount_t		*mp = ip->i_mount;	/* file system mount point */
+	__int64_t		fixlen;
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		lastx;		/* last extent pointer */
+	xfs_fileoff_t		fileblock;
+
+	if (startblock != HOLESTARTBLOCK) {
+		/* delayed extents report block -2, real ones a disk address */
+		if (startblock == DELAYSTARTBLOCK)
+			out->bmv_block = -2;
+		else
+			out->bmv_block = xfs_fsb_to_db(ip, startblock);
+
+		/* flag the entry if it maps the last extent of the data fork */
+		fileblock = XFS_BB_TO_FSB(mp, out->bmv_offset);
+		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+			out->bmv_oflags |= BMV_OF_LAST;
+		return 1;
+	}
+
+	/* a hole is reported as block -1 */
+	out->bmv_block = -1;
+	fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
+	fixlen -= out->bmv_offset;
+
+	/* only a hole ending the request of a preallocated file is trimmed */
+	if (!prealloced || out->bmv_offset + out->bmv_length != end)
+		return 1;
+
+	/* Came to hole at EOF. Trim it. */
+	if (fixlen <= 0)
+		return 0;
+	out->bmv_length = fixlen;
+	return 1;
+}
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
+ */
+int						/* error code */
+xfs_getbmap(
+	xfs_inode_t		*ip,
+	struct getbmapx		*bmv,		/* user bmap structure */
+	xfs_bmap_format_t	formatter,	/* format to user */
+	void			*arg)		/* formatter arg */
+{
+	__int64_t		bmvend;		/* last block requested */
+	int			error = 0;	/* return value */
+	__int64_t		fixlen;		/* length for -1 case */
+	int			i;		/* extent number */
+	int			lock;		/* lock state */
+	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			nex;		/* # of user extents can do */
+	int			nexleft;	/* # of user extents left */
+	int			subnex;		/* # of bmapi's can do */
+	int			nmap;		/* number of map entries */
+	struct getbmapx		*out;		/* output structure */
+	int			whichfork;	/* data or attr fork */
+	int			prealloced;	/* this is a file with
+						 * preallocated data space */
+	int			iflags;		/* interface flags */
+	int			bmapi_flags;	/* flags for xfs_bmapi */
+	int			cur_ext = 0;	/* next free slot in out[] */
+
+	mp = ip->i_mount;
+	iflags = bmv->bmv_iflags;
+	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
+
+	if (whichfork == XFS_ATTR_FORK) {
+		if (XFS_IFORK_Q(ip)) {
+			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+				return -EINVAL;
+		} else if (unlikely(
+			   ip->i_d.di_aformat != 0 &&
+			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
+			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
+					 ip->i_mount);
+			return -EFSCORRUPTED;
+		}
+
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	} else {
+		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+			return -EINVAL;
+
+		if (xfs_get_extsz_hint(ip) ||
+		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
+			prealloced = 1;
+			fixlen = mp->m_super->s_maxbytes;
+		} else {
+			prealloced = 0;
+			fixlen = XFS_ISIZE(ip);
+		}
+	}
+
+	if (bmv->bmv_length == -1) {
+		/* -1 means "to the end": derive the length from fixlen */
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+		bmv->bmv_length =
+			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
+	} else if (bmv->bmv_length == 0) {
+		bmv->bmv_entries = 0;
+		return 0;
+	} else if (bmv->bmv_length < 0) {
+		return -EINVAL;
+	}
+
+	nex = bmv->bmv_count - 1;
+	if (nex <= 0)
+		return -EINVAL;
+	bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+
+	/* guard the size multiplication below against overflow */
+	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
+		return -ENOMEM;
+	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+	if (!out)
+		return -ENOMEM;
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (whichfork == XFS_DATA_FORK) {
+		if (!(iflags & BMV_IF_DELALLOC) &&
+		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
+			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+			if (error)
+				goto out_unlock_iolock;
+
+			/*
+			 * Even after flushing the inode, there can still be
+			 * delalloc blocks on the inode beyond EOF due to
+			 * speculative preallocation.  These are not removed
+			 * until the release function is called or the inode
+			 * is inactivated.  Hence we cannot assert here that
+			 * ip->i_delayed_blks == 0.
+			 */
+		}
+
+		lock = xfs_ilock_data_map_shared(ip);
+	} else {
+		lock = xfs_ilock_attr_map_shared(ip);
+	}
+
+	/*
+	 * Don't let nex be bigger than the number of extents
+	 * we can have assuming alternating holes and real extents.
+	 */
+	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+	bmapi_flags = xfs_bmapi_aflag(whichfork);
+	if (!(iflags & BMV_IF_PREALLOC))
+		bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+	/*
+	 * Allocate enough space to handle "subnex" maps at a time.
+	 */
+	error = -ENOMEM;
+	subnex = 16;
+	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
+	if (!map)
+		goto out_unlock_ilock;
+
+	bmv->bmv_entries = 0;
+
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
+	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
+		error = 0;
+		goto out_free_map;
+	}
+
+	nexleft = nex;
+
+	do {
+		nmap = (nexleft > subnex) ? subnex : nexleft;
+		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
+				       map, &nmap, bmapi_flags);
+		if (error)
+			goto out_free_map;
+		ASSERT(nmap <= subnex);
+
+		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+			out[cur_ext].bmv_oflags = 0;
+			if (map[i].br_state == XFS_EXT_UNWRITTEN)
+				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
+			else if (map[i].br_startblock == DELAYSTARTBLOCK)
+				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
+			out[cur_ext].bmv_offset =
+				XFS_FSB_TO_BB(mp, map[i].br_startoff);
+			out[cur_ext].bmv_length =
+				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			out[cur_ext].bmv_unused1 = 0;
+			out[cur_ext].bmv_unused2 = 0;
+
+			/*
+			 * delayed allocation extents that start beyond EOF can
+			 * occur due to speculative EOF allocation when the
+			 * delalloc extent is larger than the largest freespace
+			 * extent at conversion time. These extents cannot be
+			 * converted by data writeback, so can exist here even
+			 * if we are not supposed to be finding delalloc
+			 * extents.
+			 */
+			if (map[i].br_startblock == DELAYSTARTBLOCK &&
+			    map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
+				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
+
+			if (map[i].br_startblock == HOLESTARTBLOCK &&
+			    whichfork == XFS_ATTR_FORK) {
+				/* came to the end of attribute fork */
+				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
+				goto out_free_map;
+			}
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
+					prealloced, bmvend,
+					map[i].br_startblock))
+				goto out_free_map;
+
+			/* advance the request past the extent just recorded */
+			bmv->bmv_offset =
+				out[cur_ext].bmv_offset +
+				out[cur_ext].bmv_length;
+			bmv->bmv_length =
+				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
+			bmv->bmv_entries++;
+			cur_ext++;
+		}
+	} while (nmap && nexleft && bmv->bmv_length);
+
+ out_free_map:
+	kmem_free(map);
+ out_unlock_ilock:
+	xfs_iunlock(ip, lock);
+ out_unlock_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	/*
+	 * The formatter runs only after both inode locks are dropped, on
+	 * the cur_ext entries collected above.
+	 * NOTE(review): error is reassigned in this loop, so a failure from
+	 * xfs_bmapi_read() after some extents were collected is replaced by
+	 * the formatter's return value - confirm this is intended.
+	 */
+	for (i = 0; i < cur_ext; i++) {
+		int full = 0;	/* user array is full */
+
+		/* format results & advance arg */
+		error = formatter(&arg, &out[i], &full);
+		if (error || full)
+			break;
+	}
+
+	kmem_free(out);
+	return error;
+}
+
+/*
+ * dead simple method of punching delalyed allocation blocks from a range in
+ * the inode. Walks a block at a time so will be slow, but is only executed in
+ * rare error cases so the overhead is not critical. This will always punch out
+ * both the start and end blocks, even if the ranges only partially overlap
+ * them, so it is up to the caller to ensure that partial blocks are not
+ * passed in.
+ */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); + + if (error) { + /* mapping lookup failed, alert unless shut down and bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* no mapping returned for this block, skip it */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* not delalloc (hole or already converted), skip it */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we don't + * need to cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only.
+ */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VFS_I(ip)->i_mapping->nrpages == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + bool need_iolock) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. 
+ * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + if (need_iolock) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return -EAGAIN; + } + } + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); + if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; +} + +int +xfs_alloc_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return -EINVAL; + + rt = XFS_IS_REALTIME_INODE(ip); 
+ extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. + */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. 
+ */ + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = -ENOSPC; + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. 
+ */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= XFS_ISIZE(ip)) + return 0; + + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + + error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp, + xfs_fsb_to_db(ip, imap.br_startblock), + BTOBB(mp->m_sb.sb_blocksize), + 0, &bp, NULL); + if (error) + return error; + + memset(bp->b_addr + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + return error; + } + return error; +} + +int +xfs_free_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_off_t iendoffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + xfs_off_t rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, + iendoffset); + if (error) + goto out; + truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. 
+ */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. 
+ */ + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out: + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} + +/* + * Preallocate and zero a range of a file. This mechanism has the allocation + * semantics of fallocate and in addition converts data in the range to zeroes. + */ +int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + uint blksize; + int error; + + trace_xfs_zero_file_space(ip); + + blksize = 1 << mp->m_sb.sb_blocklog; + + /* + * Punch a hole and prealloc the range. We use hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued + * by virtue of the hole punch. 
+ */ + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out; + + error = xfs_alloc_file_space(ip, round_down(offset, blksize), + round_up(offset + len, blksize) - + round_down(offset, blksize), + XFS_BMAPI_PREALLOC); +out: + return error; + +} + +/* + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. + */ +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) +{ + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t stop_fsb; + xfs_fileoff_t next_fsb; + xfs_fileoff_t shift_fsb; + + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } + + shift_fsb = XFS_B_TO_FSB(mp, len); + + /* + * Trim eofblocks to avoid shifting uninitialized post-eof preallocation + * into the accessible region of the file. + */ + if (xfs_can_free_eofblocks(ip, true)) { + error = xfs_free_eofblocks(mp, ip, false); + if (error) + return error; + } + + /* + * Writeback and invalidate cache for the remainder of the file as we're + * about to shift down every extent from offset to EOF. 
+ */ + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + offset, -1); + if (error) + return error; + error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + offset >> PAGE_CACHE_SHIFT, -1); + if (error) + return error; + + /* + * The extent shiting code works on extent granularity. So, if + * stop_fsb is not the starting block of extent, we need to split + * the extent at stop_fsb. + */ + if (direction == SHIFT_RIGHT) { + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + } + + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + /* + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + break; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &first_block); + + /* + * We are using the write transaction in which max 2 bmbt + * updates are allowed + */ + error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &free_list, + direction, XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + } + + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + return error; +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. 
+ * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); +} + +/* + * xfs_insert_file_space() + * This routine creates a hole by shifting extents for the given file. + * The caller must hold XFS_IOLOCK_EXCL. xfs_shift_file_space() first + * syncs dirty data and invalidates the page cache from offset onwards, + * then splits the extent at offset in two via xfs_bmap_split_extent(), + * and finally shifts all extent records that lie between [offset, + * last allocated extent] to the right to open up the hole range. + * RETURNS: + * 0 on success + * errno on error + */ +int +xfs_insert_file_space( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_insert_file_space(ip); + + return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6.
If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return -EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return -EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return -EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return -EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return -EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. 
+ * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + return -EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return -EINVAL; + } + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + return -EINVAL; + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return -EINVAL; + } + + return 0; +} + +static int +xfs_swap_extent_flush( + struct xfs_inode *ip) +{ + int error; + + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + truncate_pagecache_range(VFS_I(ip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VFS_I(ip)->i_mapping->nrpages) + return -EINVAL; + return 0; +} + +int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int src_log_flags, target_log_flags; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int lock_flags; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = -ENOMEM; + goto out; + } + + /* + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. 
+ */ + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = -EINVAL; + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = -EINVAL; + goto out_unlock; + } + + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); + if (error) + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = -EFAULT; + goto out_trans_cancel; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_trans_cancel; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. 
+ */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = -EBUSY; + goto out_trans_cancel; + } + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. 
+ */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. 
+ */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + src_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); + src_log_flags |= XFS_ILOG_DBROOT; + break; + } + + switch (tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + target_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); + break; + } + + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out; +} diff --git a/kernel/fs/xfs/xfs_bmap_util.h b/kernel/fs/xfs/xfs_bmap_util.h new file mode 100644 index 000000000..af97d9a1d --- /dev/null +++ b/kernel/fs/xfs/xfs_bmap_util.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_UTIL_H__ +#define __XFS_BMAP_UTIL_H__ + +/* Kernel only BMAP related definitions and functions */ + +struct xfs_bmbt_irec; +struct xfs_bmap_free_item; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; +struct xfs_bmalloca; + +int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); + +/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ +void xfs_bmap_del_free(struct xfs_bmap_free *flist, + struct xfs_bmap_free_item *prev, + struct xfs_bmap_free_item *free); +int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, + struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, + int rt, int eof, int delay, int convert, + xfs_fileoff_t *offp, xfs_extlen_t *lenp); +void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *rec, + int *is_empty); + +/* preallocation and hole punch interface */ +int 
xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len, int alloc_type); +int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); +int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); + +/* EOF block manipulation functions */ +bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + bool need_iolock); + +int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + +#endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/kernel/fs/xfs/xfs_buf.c b/kernel/fs/xfs/xfs_buf.c new file mode 100644 index 000000000..1790b00be --- /dev/null +++ b/kernel/fs/xfs/xfs_buf.c @@ -0,0 +1,1901 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +static kmem_zone_t *xfs_buf_zone; + +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) +#else +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) +#endif + +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) + + +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. + */ + return bp->b_addr && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. 
+ */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_STALE; + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); + + ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); +} + +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->__b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return -ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->__b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + +struct xfs_buf * +_xfs_buf_alloc( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + int i; + + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); + if (unlikely(!bp)) + return NULL; + + /* + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. 
+ */ + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + + atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_flags = flags; + + /* + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + trace_xfs_buf_init(bp, _RET_IP_); + + return bp; +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_xfs_buf_get_pages( + xfs_buf_t *bp, + int page_count) +{ + /* Make sure that we have a page list */ + if (bp->b_pages == NULL) { + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, KM_NOFS); + if (bp->b_pages == NULL) + return -ENOMEM; + } + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +STATIC void +_xfs_buf_free_pages( + xfs_buf_t *bp) +{ + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages); + bp->b_pages = NULL; + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. 
+ * The buffer must not be on any hash - use xfs_buf_rele instead for + * hashed and refcounted buffers + */ +void +xfs_buf_free( + xfs_buf_t *bp) +{ + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (bp->b_flags & _XBF_PAGES) { + uint i; + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + __free_page(page); + } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); + _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); +} + +/* + * Allocates all the pages for buffer in question and builds it's page list. + */ +STATIC int +xfs_buf_allocate_memory( + xfs_buf_t *bp, + uint flags) +{ + size_t size; + size_t nbytes, offset; + gfp_t gfp_mask = xb_to_gfp(flags); + unsigned short page_count, i; + xfs_off_t start, end; + int error; + + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. 
+ */ + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= _XBF_KMEM; + return 0; + } + +use_alloc_page: + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; + page_count = end - start; + error = _xfs_buf_get_pages(bp, page_count); + if (unlikely(error)) + return error; + + offset = bp->b_offset; + bp->b_flags |= _XBF_PAGES; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page; + uint retries = 0; +retry: + page = alloc_page(gfp_mask); + if (unlikely(page == NULL)) { + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + error = -ENOMEM; + goto out_free_pages; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, gfp_mask); + + XFS_STATS_INC(xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + XFS_STATS_INC(xb_page_found); + + nbytes = min_t(size_t, size, PAGE_SIZE - offset); + size -= nbytes; + bp->b_pages[i] = page; + offset = 0; + } + return 0; + +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); + return error; +} + +/* + * Map buffer into kernel address-space if necessary. 
+ */ +STATIC int +_xfs_buf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { + int retried = 0; + unsigned noio_flag; + + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); + + if (!bp->b_addr) + return -ENOMEM; + bp->b_addr += bp->b_offset; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * Look up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. No I/O is implied by this call. 
/*
 * Look up a cached buffer covering the block range described by @map.
 *
 * The per-AG rbtree is searched for a buffer whose first block number and
 * total length both match.  On a hit a hold is taken on the buffer, it is
 * locked (or NULL is returned if it is busy and XBF_TRYLOCK was passed) and
 * any stale state is cleared before it is returned.
 *
 * On a miss, @new_bp - if the caller supplied one - is linked into the tree
 * and returned; otherwise NULL is returned.  No I/O is issued here.
 *
 * NULL is also returned when the first block number lies outside the
 * filesystem.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	size_t			numbytes;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
	xfs_daddr_t		blkno = map[0].bm_bn;
	xfs_daddr_t		eofs;
	int			numblks = 0;
	int			i;

	/* total length of the buffer is the sum of all map lengths */
	for (i = 0; i < nmaps; i++)
		numblks += map[i].bm_len;
	numbytes = BBTOB(numblks);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(numbytes < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (blkno < 0 || blkno >= eofs) {
		/*
		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
		 * but none of the higher level infrastructure supports
		 * returning a specific error on buffer lookup failures.
		 */
		xfs_alert(btp->bt_mount,
			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
			  __func__, blkno, eofs);
		WARN_ON(1);
		return NULL;
	}

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_length != numblks) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			/* take the hold while still under pag_buf_lock */
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		/* parent/rbp still describe the empty slot the walk ended on */
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			/* caller will not wait: drop the hold taken above */
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		ASSERT(bp->b_iodone == NULL);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}
+ */ +struct xfs_buf * +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + struct xfs_buf *new_bp; + int error = 0; + + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); + if (unlikely(!new_bp)) + return NULL; + + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { + xfs_buf_free(new_bp); + return NULL; + } + + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); + +found: + if (!bp->b_addr) { + error = _xfs_buf_map_pages(bp, flags); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pagesn", __func__); + xfs_buf_relse(bp); + return NULL; + } + } + + XFS_STATS_INC(xb_get); + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; +} + +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + ASSERT(!(flags & XBF_WRITE)); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); + + if (flags & XBF_ASYNC) { + xfs_buf_submit(bp); + return 0; + } + return xfs_buf_submit_wait(bp); +} + +xfs_buf_t * +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get_map(target, map, nmaps, flags); + if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!XFS_BUF_ISDONE(bp)) { + XFS_STATS_INC(xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + } else if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } else { + /* We do not want read in the flags */ + bp->b_flags &= 
~XBF_READ; + } + } + + return bp; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + const struct xfs_buf_ops *ops) +{ + if (bdi_read_congested(target->bt_bdi)) + return; + + xfs_buf_read_map(target, map, nmaps, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. + */ +int +xfs_buf_read_uncached( + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t numblks, + int flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + *bpp = NULL; + + bp = xfs_buf_get_uncached(target, numblks, flags); + if (!bp) + return -ENOMEM; + + /* set up the buffer for a read IO */ + ASSERT(bp->b_map_count == 1); + bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ + bp->b_maps[0].bm_bn = daddr; + bp->b_flags |= XBF_READ; + bp->b_ops = ops; + + xfs_buf_submit_wait(bp); + if (bp->b_error) { + int error = bp->b_error; + xfs_buf_relse(bp); + return error; + } + + *bpp = bp; + return 0; +} + +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. 
+ */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t numblks) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_length = numblks; + bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if ((!is_vmalloc_addr(addr))) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +xfs_buf_associate_memory( + xfs_buf_t *bp, + void *mem, + size_t len) +{ + int rval; + int i = 0; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; + int page_count; + + pageaddr = (unsigned long)mem & PAGE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; + + /* Free any previous set of page pointers */ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_addr = mem; + + rval = _xfs_buf_get_pages(bp, page_count); + if (rval) + return rval; + + bp->b_offset = offset; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_SIZE; + } + + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); + + return 0; +} + +xfs_buf_t * +xfs_buf_get_uncached( + struct xfs_buftarg *target, + size_t numblks, + int flags) +{ + unsigned long page_count; + int error, i; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + + bp = _xfs_buf_alloc(target, &map, 1, 0); + if (unlikely(bp == NULL)) + goto fail; + + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; + error = _xfs_buf_get_pages(bp, page_count); + if (error) + goto fail_free_buf; + + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); + if (!bp->b_pages[i]) + goto fail_free_mem; + } + bp->b_flags |= _XBF_PAGES; + + 
error = _xfs_buf_map_pages(bp, 0); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages", __func__); + goto fail_free_mem; + } + + trace_xfs_buf_get_uncached(bp, _RET_IP_); + return bp; + + fail_free_mem: + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + fail_free_buf: + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); + fail: + return NULL; +} + +/* + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * Must hold the buffer already to call this function. + */ +void +xfs_buf_hold( + xfs_buf_t *bp) +{ + trace_xfs_buf_hold(bp, _RET_IP_); + atomic_inc(&bp->b_hold); +} + +/* + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. + */ +void +xfs_buf_rele( + xfs_buf_t *bp) +{ + struct xfs_perag *pag = bp->b_pag; + + trace_xfs_buf_rele(bp, _RET_IP_); + + if (!pag) { + ASSERT(list_empty(&bp->b_lru)); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + + ASSERT(atomic_read(&bp->b_hold) > 0); + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + 
list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + xfs_buf_free(bp); + } + } +} + + +/* + * Lock a buffer object, if it is not already locked. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. + */ +int +xfs_buf_trylock( + struct xfs_buf *bp) +{ + int locked; + + locked = down_trylock(&bp->b_sema) == 0; + if (locked) + XB_SET_OWNER(bp); + + trace_xfs_buf_trylock(bp, _RET_IP_); + return locked; +} + +/* + * Lock a buffer object. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. 
+ */ +void +xfs_buf_lock( + struct xfs_buf *bp) +{ + trace_xfs_buf_lock(bp, _RET_IP_); + + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + down(&bp->b_sema); + XB_SET_OWNER(bp); + + trace_xfs_buf_lock_done(bp, _RET_IP_); +} + +void +xfs_buf_unlock( + struct xfs_buf *bp) +{ + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + + trace_xfs_buf_unlock(bp, _RET_IP_); +} + +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&bp->b_pin_count) == 0) + return; + + add_wait_queue(&bp->b_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&bp->b_pin_count) == 0) + break; + io_schedule(); + } + remove_wait_queue(&bp->b_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +void +xfs_buf_ioend( + struct xfs_buf *bp) +{ + bool read = bp->b_flags & XBF_READ; + + trace_xfs_buf_iodone(bp, _RET_IP_); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + + /* + * Pull in IO completion errors now. We are guaranteed to be running + * single threaded, so we don't need the lock to read b_io_error. 
+ */ + if (!bp->b_error && bp->b_io_error) + xfs_buf_ioerror(bp, bp->b_io_error); + + /* Only validate buffers that were read without errors */ + if (read && !bp->b_error && bp->b_ops) { + ASSERT(!bp->b_iodone); + bp->b_ops->verify_read(bp); + } + + if (!bp->b_error) + bp->b_flags |= XBF_DONE; + + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else + complete(&bp->b_iowait); +} + +static void +xfs_buf_ioend_work( + struct work_struct *work) +{ + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_ioend_work); + + xfs_buf_ioend(bp); +} + +void +xfs_buf_ioend_async( + struct xfs_buf *bp) +{ + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_ioend_wq, &bp->b_ioend_work); +} + +void +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) +{ + ASSERT(error <= 0 && error >= -1000); + bp->b_error = error; + trace_xfs_buf_ioerror(bp, error, _RET_IP_); +} + +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); +} + +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | + XBF_WRITE_FAIL | XBF_DONE); + + error = xfs_buf_submit_wait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + +STATIC void +xfs_buf_bio_end_io( + struct bio *bio, + int error) +{ + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. 
+ */ + if (error) { + spin_lock(&bp->b_lock); + if (!bp->b_io_error) + bp->b_io_error = error; + spin_unlock(&bp->b_lock); + } + + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend_async(bp); + bio_put(bio); +} + +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) +{ + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; + + total_nr_pages = bp->b_page_count; + + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; + } + + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; + +next_chunk: + atomic_inc(&bp->b_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = bp->b_target->bt_bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; + + + for (; size && nr_pages; nr_pages--, page_index++) { + int rbytes, nbytes = PAGE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); + if (rbytes < nbytes) + break; + + offset = 0; + sector += BTOBB(nbytes); + size -= nbytes; + total_nr_pages--; + } + + if (likely(bio->bi_iter.bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + /* + * 
This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_submit) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); + xfs_buf_ioerror(bp, -EIO); + bio_put(bio); + } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + + /* + * Initialize the I/O completion workqueue if we haven't yet or the + * submitter has not opted to specify a custom one. + */ + if (!bp->b_ioend_wq) + bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * non-crc filesystems don't attach verifiers during + * log recovery, so don't warn for such filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_warn(mp, + "%s: no ops on block 0x%llx/0x%x", + __func__, bp->b_bn, bp->b_length); + xfs_hex_dump(bp->b_addr, 64); + dump_stack(); + } + } + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. 
Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); +} + +/* + * Asynchronous IO submission path. This transfers the buffer lock ownership and + * the current reference to the IO. It is not safe to reference the buffer after + * a call to this function unless the caller holds an additional reference + * itself. + */ +void +xfs_buf_submit( + struct xfs_buf *bp) +{ + trace_xfs_buf_submit(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + ASSERT(bp->b_flags & XBF_ASYNC); + + /* on shutdown we stale and complete the buffer immediately */ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + return; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * The caller's reference is released during I/O completion. + * This occurs some time after the last b_io_remaining reference is + * released, so after we drop our Io reference we have to have some + * other reference to ensure the buffer doesn't go away from underneath + * us. Take a direct reference to ensure we have safe access to the + * buffer until we are finished with it. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. 
+ */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * If _xfs_buf_ioapply failed, we can get back here with only the IO + * reference we took above. If we drop it to zero, run completion so + * that we don't return to the caller with completion still pending. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + if (bp->b_error) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); + } + + xfs_buf_rele(bp); + /* Note: it is not safe to reference bp now we've dropped our ref */ +} + +/* + * Synchronous buffer IO submission path, read or write. + */ +int +xfs_buf_submit_wait( + struct xfs_buf *bp) +{ + int error; + + trace_xfs_buf_submit_wait(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + bp->b_flags &= ~XBF_DONE; + return -EIO; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * For synchronous IO, the IO does not inherit the submitters reference + * count, nor the buffer lock. Hence we cannot release the reference we + * are about to take until we've waited for all IO completion to occur, + * including any xfs_buf_ioend_async() work that may be pending. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * make sure we run completion synchronously if it raced with us and is + * already complete. 
+ */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp); + + /* wait for completion before gathering the error from the buffer */ + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); + trace_xfs_buf_iowait_done(bp, _RET_IP_); + error = bp->b_error; + + /* + * all done now, we can release the hold that keeps the buffer + * referenced for the entire IO. + */ + xfs_buf_rele(bp); + return error; +} + +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, + size_t offset) +{ + struct page *page; + + if (bp->b_addr) + return bp->b_addr + offset; + + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); +} + +/* + * Move data into or out of a buffer. + */ +void +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + void *data, /* data address */ + xfs_buf_rw_t mode) /* read/write/zero flag */ +{ + size_t bend; + + bend = boff + bsize; + while (boff < bend) { + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); + + ASSERT((csize + page_offset) <= PAGE_SIZE); + + switch (mode) { + case XBRW_ZERO: + memset(page_address(page) + page_offset, 0, csize); + break; + case XBRW_READ: + memcpy(data, page_address(page) + page_offset, csize); + break; + case XBRW_WRITE: + memcpy(page_address(page) + page_offset, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buffer targets (buftargs). + */ + +/* + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. 
+ */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_lru_isolate_move(lru, item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +void +xfs_wait_buftarg( + struct xfs_buftarg *btp) +{ + LIST_HEAD(dispose); + int loop = 0; + + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } + xfs_buf_rele(bp); + } + if (loop++ != 0) + delay(100); + } +} + +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. 
If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_lru_isolate_move(lru, item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + LIST_HEAD(dispose); + unsigned long freed; + + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_shrink_count(&btp->bt_lru, sc); +} + +void +xfs_free_buftarg( + struct xfs_mount *mp, + struct xfs_buftarg *btp) +{ + unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); + + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); + + kmem_free(btp); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int sectorsize) +{ + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + + if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s", + sectorsize, name); + return -EINVAL; + } + + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 
1; + + return 0; +} + +/* + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. + */ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct xfs_mount *mp, + struct block_device *bdev) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + + btp->bt_mount = mp; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&btp->bt_shrinker); + return btp; + +error: + kmem_free(btp); + return NULL; +} + +/* + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. + */ +bool +xfs_buf_delwri_queue( + struct xfs_buf *bp, + struct list_head *list) +{ + ASSERT(xfs_buf_islocked(bp)); + ASSERT(!(bp->b_flags & XBF_READ)); + + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. 
+ */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; + } + + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + + /* + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. + */ + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); + } + + return true; +} + +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) +{ + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } + + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. 
+ */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } + + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } + + list_sort(NULL, io_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + + /* + * we do all Io submission async. This means if we need to wait + * for IO completion we need to take an extra reference so the + * buffer is still valid on the other side. + */ + if (wait) + xfs_buf_hold(bp); + else + list_del_init(&bp->b_list); + + xfs_buf_submit(bp); + } + blk_finish_plug(&plug); + + return pinned; +} + +/* + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. + */ +int +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} + +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; + + __xfs_buf_delwri_submit(buffer_list, &io_list, true); + + /* Wait for IO to complete. 
*/ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); + + list_del_init(&bp->b_list); + + /* locking the buffer will wait for async IO completion. */ + xfs_buf_lock(bp); + error2 = bp->b_error; + xfs_buf_relse(bp); + if (!error) + error = error2; + } + + return error; +} + +int __init +xfs_buf_init(void) +{ + xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", + KM_ZONE_HWALIGN, NULL); + if (!xfs_buf_zone) + goto out; + + return 0; + + out: + return -ENOMEM; +} + +void +xfs_buf_terminate(void) +{ + kmem_zone_destroy(xfs_buf_zone); +} diff --git a/kernel/fs/xfs/xfs_buf.h b/kernel/fs/xfs/xfs_buf.h new file mode 100644 index 000000000..75ff5d5a7 --- /dev/null +++ b/kernel/fs/xfs/xfs_buf.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ + +/* I/O hints for the BIO layer */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ + +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ + +/* flags used only internally */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ + +typedef unsigned int xfs_buf_flags_t; + +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { 
XBF_DONE, "DONE" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ + { XBF_SYNCIO, "SYNCIO" }, \ + { XBF_FUA, "FUA" }, \ + { XBF_FLUSH, "FLUSH" }, \ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" } + + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ + +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ +typedef struct xfs_buftarg { + dev_t bt_dev; + struct block_device *bt_bdev; + struct backing_dev_info *bt_bdi; + struct xfs_mount *bt_mount; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_lru bt_lru; +} xfs_buftarg_t; + +struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + + +#define XB_PAGES 2 + +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + +typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an uncontended cache + * hit to be fully processed. 
The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_daddr_t b_bn; /* block number of buffer */ + int b_length; /* size of buffer in BBs */ + atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ + struct list_head b_lru; /* lru list */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ + int b_io_error; /* internal IO error state */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_ioend_work; + struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + struct completion b_iowait; /* queue for I/O waiters */ + void *b_fspriv; + struct xfs_trans *b_transp; + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ + int b_map_count; + int b_io_length; /* IO size in BBs */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + int b_error; /* error code on I/O */ + const struct xfs_buf_ops *b_ops; + +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; +#endif +} xfs_buf_t; + +/* Finding and 
Reading Buffers */ +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} + +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + const struct xfs_buf_ops *ops); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags, ops); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1, ops); +} + +struct xfs_buf 
*xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, + size_t numblks, int flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); +void xfs_buf_hold(struct xfs_buf *bp); + +/* Releasing Buffers */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); + +/* Locking and Unlocking Buffers */ +extern int xfs_buf_trylock(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); +#define xfs_buf_islocked(bp) \ + ((bp)->b_sema.count <= 0) + +/* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_buf *bp); +extern void xfs_buf_ioend(struct xfs_buf *bp); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_submit(struct xfs_buf *bp); +extern int xfs_buf_submit_wait(struct xfs_buf *bp); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, + xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) + +/* Buffer Utility Routines */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); + +/* Delayed Write Buffer Routines */ +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); + +/* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); + +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) + +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_UNSTALE(bp) 
((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. 
+ */ +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) + +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} + +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +/* + * Handling of buftargs. + */ +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); + +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#endif /* __XFS_BUF_H__ */ diff --git a/kernel/fs/xfs/xfs_buf_item.c b/kernel/fs/xfs/xfs_buf_item.c new file mode 100644 index 000000000..092d652bc --- /dev/null +++ b/kernel/fs/xfs/xfs_buf_item.c @@ -0,0 +1,1155 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_buf_item_zone; + +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_buf_log_item, bli_item); +} + +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); + +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) +{ + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); +} + +/* + * This returns the number of log iovecs needed to log the + * given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure + * and 1 for each stretch of non-contiguous chunks to be logged. + * Contiguous chunks are logged in a single iovec. + * + * If the XFS_BLI_STALE flag has been set, then log nothing. + */ +STATIC void +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) +{ + struct xfs_buf *bp = bip->bli_buf; + int next_bit; + int last_bit; + + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; + + while (last_bit != -1) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. 
It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); + /* + * If we run out of bits, leave the loop, + * else if we find a new set of bits bump the number of vecs, + * else keep scanning the current set of bits. + */ + if (next_bit == -1) { + break; + } else if (next_bit != last_bit + 1) { + last_bit = next_bit; + (*nvecs)++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { + last_bit = next_bit; + (*nvecs)++; + } else { + last_bit++; + } + *nbytes += XFS_BLF_CHUNK; + } +} + +/* + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. + */ +STATIC void +xfs_buf_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. 
+ */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. 
+ */ + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); + } + trace_xfs_buf_item_size(bip); +} + +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) +{ + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ + struct xfs_buf *bp = bip->bli_buf; + uint base_size; + int first_bit; + int last_bit; + int next_bit; + uint nbits; + + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->__bli_format.blf_flags; + + /* + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. + */ + base_size = xfs_buf_log_format_size(blfp); + + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + return; + } + + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. 
+ */ + trace_xfs_buf_item_format_stale(bip); + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); + return; + } + + + /* + * Fill in an iovec for each set of contiguous chunks. + */ + last_bit = first_bit; + nbits = 1; + for (;;) { + /* + * This takes the bit number to start looking from and + * returns the next set bit from there. It returns -1 + * if there are no more bits set or the start bit is + * beyond the end of the bitmap. + */ + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); + /* + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. + */ + if (next_bit == -1) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; + break; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; + first_bit = next_bit; + last_bit = next_bit; + nbits = 1; + } else { + last_bit++; + nbits++; + } + } +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. 
+ */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + ASSERT((bip->bli_flags & XFS_BLI_STALE) || + (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF + && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } + + /* + * Check to make sure everything is consistent. 
+ */ + trace_xfs_buf_item_format(bip); +} + +/* + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. + */ +STATIC void +xfs_buf_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); + + trace_xfs_buf_item_pin(bip); + + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} + +/* + * This is called to unpin the buffer associated with the buf log + * item which was previously pinned with a call to xfs_buf_item_pin(). + * + * Also drop the reference to the buf item for the current transaction. + * If the XFS_BLI_STALE flag is set and we are the last reference, + * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. 
 */
STATIC void
xfs_buf_item_unpin(
	struct xfs_log_item	*lip,
	int			remove)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	xfs_buf_t	*bp = bip->bli_buf;
	struct xfs_ail	*ailp = lip->li_ailp;
	int		stale = bip->bli_flags & XFS_BLI_STALE;
	int		freed;

	ASSERT(bp->b_fspriv == bip);
	ASSERT(atomic_read(&bip->bli_refcount) > 0);

	trace_xfs_buf_item_unpin(bip);

	/* freed is true when this call dropped the last bli reference */
	freed = atomic_dec_and_test(&bip->bli_refcount);

	/* wake anyone waiting on the buffer once the last pin is gone */
	if (atomic_dec_and_test(&bp->b_pin_count))
		wake_up_all(&bp->b_waiters);

	if (freed && stale) {
		ASSERT(bip->bli_flags & XFS_BLI_STALE);
		ASSERT(xfs_buf_islocked(bp));
		ASSERT(XFS_BUF_ISSTALE(bp));
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);

		trace_xfs_buf_item_unpin_stale(bip);

		if (remove) {
			/*
			 * If we are in a transaction context, we have to
			 * remove the log item from the transaction as we are
			 * about to release our reference to the buffer.  If we
			 * don't, the unlock that occurs later in
			 * xfs_trans_uncommit() will try to reference the
			 * buffer which we no longer have a hold on.
			 */
			if (lip->li_desc)
				xfs_trans_del_item(lip);

			/*
			 * Since the transaction no longer refers to the buffer,
			 * the buffer should no longer refer to the transaction.
			 */
			bp->b_transp = NULL;
		}

		/*
		 * If we get called here because of an IO error, we may
		 * or may not have the item on the AIL. xfs_trans_ail_delete()
		 * will take care of that situation.
		 * xfs_trans_ail_delete() drops the AIL lock.
		 */
		if (bip->bli_flags & XFS_BLI_STALE_INODE) {
			xfs_buf_do_callbacks(bp);
			bp->b_fspriv = NULL;
			bp->b_iodone = NULL;
		} else {
			spin_lock(&ailp->xa_lock);
			xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR);
			xfs_buf_item_relse(bp);
			ASSERT(bp->b_fspriv == NULL);
		}
		xfs_buf_relse(bp);
	} else if (freed && remove) {
		/*
		 * There are currently two references to the buffer - the active
		 * LRU reference and the buf log item. What we are about to do
		 * here - simulate a failed IO completion - requires 3
		 * references.
		 *
		 * The LRU reference is removed by the xfs_buf_stale() call. The
		 * buf item reference is removed by the xfs_buf_iodone()
		 * callback that is run by xfs_buf_do_callbacks() during ioend
		 * processing (via the bp->b_iodone callback), and then finally
		 * the ioend processing will drop the IO reference if the buffer
		 * is marked XBF_ASYNC.
		 *
		 * Hence we need to take an additional reference here so that IO
		 * completion processing doesn't free the buffer prematurely.
		 */
		xfs_buf_lock(bp);
		xfs_buf_hold(bp);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_ioerror(bp, -EIO);
		XFS_BUF_UNDONE(bp);
		xfs_buf_stale(bp);
		xfs_buf_ioend(bp);
	}
}

/*
 * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
 * seconds so as to not spam logs too much on repeated detection of the same
 * buffer being bad.
 */

static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);

/*
 * Try to queue the buffer backing this log item for delayed write.
 *
 * Returns XFS_ITEM_PINNED if the buffer is pinned, XFS_ITEM_LOCKED if the
 * buffer lock could not be taken, XFS_ITEM_FLUSHING if
 * xfs_buf_delwri_queue() did not queue the buffer onto buffer_list, and
 * XFS_ITEM_SUCCESS otherwise.
 */
STATIC uint
xfs_buf_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	uint			rval = XFS_ITEM_SUCCESS;

	if (xfs_buf_ispinned(bp))
		return XFS_ITEM_PINNED;
	if (!xfs_buf_trylock(bp)) {
		/*
		 * If we have just raced with a buffer being pinned and it has
		 * been marked stale, we could end up stalling until someone else
		 * issues a log force to unpin the stale buffer. Check for the
		 * race condition here so xfsaild recognizes the buffer is pinned
		 * and queues a log force to move it along.
		 */
		if (xfs_buf_ispinned(bp))
			return XFS_ITEM_PINNED;
		return XFS_ITEM_LOCKED;
	}

	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));

	trace_xfs_buf_item_push(bip);

	/* has a previous flush failed due to IO errors? */
	if ((bp->b_flags & XBF_WRITE_FAIL) &&
	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
		xfs_warn(bp->b_target->bt_mount,
"Failing async write on buffer block 0x%llx. Retrying async write.",
			 (long long)bp->b_bn);
	}

	if (!xfs_buf_delwri_queue(bp, buffer_list))
		rval = XFS_ITEM_FLUSHING;
	xfs_buf_unlock(bp);
	return rval;
}

/*
 * Release the buffer associated with the buf log item.  If there is no dirty
 * logged data associated with the buffer recorded in the buf log item, then
 * free the buf log item and remove the reference to it in the buffer.
 *
 * This call ignores the recursion count.  It is only called when the buffer
 * should REALLY be unlocked, regardless of the recursion count.
 *
 * We unconditionally drop the transaction's reference to the log item. If the
 * item was logged, then another reference was taken when it was pinned, so we
 * can safely drop the transaction reference now.  This also allows us to avoid
 * potential races with the unpin code freeing the bli by not referencing the
 * bli after we've dropped the reference count.
 *
 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
 * if necessary but do not unlock the buffer.  This is for support of
 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
 * free the item.
 */
STATIC void
xfs_buf_item_unlock(
	struct xfs_log_item	*lip)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
	struct xfs_buf		*bp = bip->bli_buf;
	bool			clean;
	bool			aborted;
	int			flags;

	/* Clear the buffer's association with this transaction. */
	bp->b_transp = NULL;

	/*
	 * If this is a transaction abort, don't return early.  Instead, allow
	 * the brelse to happen.  Normally it would be done for stale
	 * (cancelled) buffers at unpin time, but we'll never go through the
	 * pin/unpin cycle if we abort inside commit.
	 */
	aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
	/*
	 * Before possibly freeing the buf item, copy the per-transaction state
	 * so we can reference it safely later after clearing it from the
	 * buffer log item.
	 */
	flags = bip->bli_flags;
	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);

	/*
	 * If the buf item is marked stale, then don't do anything.  We'll
	 * unlock the buffer and free the buf item when the buffer is unpinned
	 * for the last time.
	 */
	if (flags & XFS_BLI_STALE) {
		trace_xfs_buf_item_unlock_stale(bip);
		ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
		if (!aborted) {
			atomic_dec(&bip->bli_refcount);
			return;
		}
	}

	trace_xfs_buf_item_unlock(bip);

	/*
	 * If the buf item isn't tracking any data, free it, otherwise drop the
	 * reference we hold to it. If we are aborting the transaction, this may
	 * be the only reference to the buf item, so we free it anyway
	 * regardless of whether it is dirty or not. A dirty abort implies a
	 * shutdown, anyway.
	 *
	 * Ordered buffers are dirty but may have no recorded changes, so ensure
	 * we only release clean items here.
	 */
	clean = (flags & XFS_BLI_DIRTY) ? false : true;
	if (clean) {
		int i;
		for (i = 0; i < bip->bli_format_count; i++) {
			if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
				     bip->bli_formats[i].blf_map_size)) {
				clean = false;
				break;
			}
		}
	}

	/*
	 * Clean buffers, by definition, cannot be in the AIL. However, aborted
	 * buffers may be dirty and hence in the AIL. Therefore if we are
	 * aborting a buffer and we've just taken the last reference away, we
	 * have to check if it is in the AIL before freeing it. We need to free
	 * it in this case, because an aborted transaction has already shut the
	 * filesystem down and this is the last chance we will have to do so.
	 */
	if (atomic_dec_and_test(&bip->bli_refcount)) {
		if (clean)
			xfs_buf_item_relse(bp);
		else if (aborted) {
			ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
			if (lip->li_flags & XFS_LI_IN_AIL) {
				/* xfs_trans_ail_delete() drops the AIL lock */
				spin_lock(&lip->li_ailp->xa_lock);
				xfs_trans_ail_delete(lip->li_ailp, lip,
						     SHUTDOWN_LOG_IO_ERROR);
			}
			xfs_buf_item_relse(bp);
		}
	}

	/* a held buffer stays locked for the caller; see xfs_trans_bhold() */
	if (!(flags & XFS_BLI_HOLD))
		xfs_buf_relse(bp);
}

/*
 * This is called to find out where the oldest active copy of the
 * buf log item in the on disk log resides now that the last log
 * write of it completed at the given lsn.
 * We always re-log all the dirty data in a buffer, so usually the
 * latest copy in the on disk log is the only one that matters.  For
 * those cases we simply return the given lsn.
 *
 * The one exception to this is for buffers full of newly allocated
 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 * flag set, indicating that only the di_next_unlinked fields from the
 * inodes in the buffers will be replayed during recovery.  If the
 * original newly allocated inode images have not yet been flushed
 * when the buffer is so relogged, then we need to make sure that we
 * keep the old images in the 'active' portion of the log.  We do this
 * by returning the original lsn of that transaction here rather than
 * the current one.
 */
STATIC xfs_lsn_t
xfs_buf_item_committed(
	struct xfs_log_item	*lip,
	xfs_lsn_t		lsn)
{
	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);

	trace_xfs_buf_item_committed(bip);

	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0)
		return lip->li_lsn;
	return lsn;
}

/*
 * Buf log items have nothing to do at commit time; this empty function only
 * fills the iop_committing slot of the ops vector.
 */
STATIC void
xfs_buf_item_committing(
	struct xfs_log_item	*lip,
	xfs_lsn_t		commit_lsn)
{
}

/*
 * This is the ops vector shared by all buf log items.
+ */ +static const struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_committing = xfs_buf_item_committing +}; + +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return 0; + } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return -ENOMEM; + return 0; +} + +STATIC void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } +} + +/* + * Allocate a new buf log item to go with the given buffer. + * Set the buffer's b_fsprivate field to point to the new + * buf log item. If there are other item's attached to the + * buffer (see xfs_buf_attach_iodone() below), then put the + * buf log item at the front. + */ +void +xfs_buf_item_init( + xfs_buf_t *bp, + xfs_mount_t *mp) +{ + xfs_log_item_t *lip = bp->b_fspriv; + xfs_buf_log_item_t *bip; + int chunks; + int map_size; + int error; + int i; + + /* + * Check to see if there is already a buf log item for + * this buffer. If there is, it is guaranteed to be + * the first. If we do already have one, there is + * nothing to do here so return. + */ + ASSERT(bp->b_target->bt_mount == mp); + if (lip != NULL && lip->li_type == XFS_LI_BUF) + return; + + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); + bip->bli_buf = bp; + xfs_buf_hold(bp); + + /* + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. 
Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. + */ + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } + + /* + * Put the buf item into the list of items attached to the + * buffer at the front. + */ + if (bp->b_fspriv) + bip->bli_item.li_bio_list = bp->b_fspriv; + bp->b_fspriv = bip; +} + + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +static void +xfs_buf_item_log_segment( + uint first, + uint last, + uint *map) +{ + uint first_bit; + uint last_bit; + uint bits_to_set; + uint bits_set; + uint word_num; + uint *wordp; + uint bit; + uint end_bit; + uint mask; + + /* + * Convert byte offsets to bit numbers. + */ + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; + + /* + * Calculate the total number of bits to be set. + */ + bits_to_set = last_bit - first_bit + 1; + + /* + * Get a pointer to the first word in the bitmap + * to set a bit in. + */ + word_num = first_bit >> BIT_TO_WORD_SHIFT; + wordp = &map[word_num]; + + /* + * Calculate the starting bit in the first word. + */ + bit = first_bit & (uint)(NBWORD - 1); + + /* + * First set any bits in the first word of our range. + * If it starts at bit 0 of the word, it will be + * set below rather than here. That is what the variable + * bit tells us. The variable bits_set tracks the number + * of bits that have been set so far. 
End_bit is the number + * of the last bit to be set in this word plus one. + */ + if (bit) { + end_bit = MIN(bit + bits_to_set, (uint)NBWORD); + mask = ((1 << (end_bit - bit)) - 1) << bit; + *wordp |= mask; + wordp++; + bits_set = end_bit - bit; + } else { + bits_set = 0; + } + + /* + * Now set bits a whole word at a time that are between + * first_bit and last_bit. + */ + while ((bits_to_set - bits_set) >= NBWORD) { + *wordp |= 0xffffffff; + bits_set += NBWORD; + wordp++; + } + + /* + * Finally, set any bits left to be set in one last partial word. + */ + end_bit = bits_to_set - bits_set; + if (end_bit) { + mask = (1 << end_bit) - 1; + *wordp |= mask; + } +} + +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } +} + + +/* + * Return 1 if the buffer has been logged or ordered in a transaction (at any + * point, not just the current transaction) and 0 if not. + */ +uint +xfs_buf_item_dirty( + xfs_buf_log_item_t *bip) +{ + return (bip->bli_flags & XFS_BLI_DIRTY); +} + +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ + xfs_buf_item_free_format(bip); + kmem_zone_free(xfs_buf_item_zone, bip); +} + +/* + * This is called when the buf log item is no longer needed. It should + * free the buf log item associated with the given buffer and clear + * the buffer's pointer to the buf log item. 
If there are no more + * items in the list, clear the b_iodone field of the buffer (see + * xfs_buf_attach_iodone() below). + */ +void +xfs_buf_item_relse( + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + + bp->b_fspriv = bip->bli_item.li_bio_list; + if (bp->b_fspriv == NULL) + bp->b_iodone = NULL; + + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + + +/* + * Add the given log item with its callback to the list of callbacks + * to be called when the buffer's I/O completes. If it is not set + * already, set the buffer's b_iodone() routine to be + * xfs_buf_iodone_callbacks() and link the log item into the list of + * items rooted at b_fsprivate. Items are always added as the second + * entry in the list if there is a first, because the buf item code + * assumes that the buf log item is first. + */ +void +xfs_buf_attach_iodone( + xfs_buf_t *bp, + void (*cb)(xfs_buf_t *, xfs_log_item_t *), + xfs_log_item_t *lip) +{ + xfs_log_item_t *head_lip; + + ASSERT(xfs_buf_islocked(bp)); + + lip->li_cb = cb; + head_lip = bp->b_fspriv; + if (head_lip) { + lip->li_bio_list = head_lip->li_bio_list; + head_lip->li_bio_list = lip; + } else { + bp->b_fspriv = lip; + } + + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; +} + +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. 
 * This allows the
 * callback to scan and modify the list attached to the buffer and we don't
 * have to care about maintaining a next item pointer.
 */
STATIC void
xfs_buf_do_callbacks(
	struct xfs_buf		*bp)
{
	struct xfs_log_item	*lip;

	while ((lip = bp->b_fspriv) != NULL) {
		bp->b_fspriv = lip->li_bio_list;
		ASSERT(lip->li_cb != NULL);
		/*
		 * Clear the next pointer so we don't have any
		 * confusion if the item is added to another buf.
		 * Don't touch the log item after calling its
		 * callback, because it could have freed itself.
		 */
		lip->li_bio_list = NULL;
		lip->li_cb(bp, lip);
	}
}

/*
 * This is the iodone() function for buffers which have had callbacks
 * attached to them by xfs_buf_attach_iodone().  It should remove each
 * log item from the buffer's list and call the callback of each in turn.
 * When done, the buffer's fsprivate field is set to NULL and the buffer
 * is unlocked with a call to iodone().
 */
void
xfs_buf_iodone_callbacks(
	struct xfs_buf		*bp)
{
	struct xfs_log_item	*lip = bp->b_fspriv;
	struct xfs_mount	*mp = lip->li_mountp;
	/* when, and for which target, the last I/O error alert was issued */
	static ulong		lasttime;
	static xfs_buftarg_t	*lasttarg;

	if (likely(!bp->b_error))
		goto do_callbacks;

	/*
	 * If we've already decided to shutdown the filesystem because of
	 * I/O errors, there's no point in giving this a retry.
	 */
	if (XFS_FORCED_SHUTDOWN(mp)) {
		xfs_buf_stale(bp);
		XFS_BUF_DONE(bp);
		trace_xfs_buf_item_iodone(bp, _RET_IP_);
		goto do_callbacks;
	}

	/*
	 * Only alert when the target differs from the last one that failed
	 * or more than 5 seconds have passed since the last alert.
	 */
	if (bp->b_target != lasttarg ||
	    time_after(jiffies, (lasttime + 5*HZ))) {
		lasttime = jiffies;
		xfs_buf_ioerror_alert(bp, __func__);
	}
	lasttarg = bp->b_target;

	/*
	 * If the write was asynchronous then no one will be looking for the
	 * error.  Clear the error state and write the buffer out again.
	 *
	 * XXX: This helps against transient write errors, but we need to find
	 * a way to shut the filesystem down if the writes keep failing.
	 *
	 * In practice we'll shut the filesystem down soon as non-transient
	 * errors tend to affect the whole device and a failing log write
	 * will make us give up.  But we really ought to do better here.
	 */
	if (XFS_BUF_ISASYNC(bp)) {
		ASSERT(bp->b_iodone != NULL);

		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);

		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */

		/*
		 * Resubmit only if the buffer is not stale and has not
		 * already been marked as a failed write; otherwise just
		 * release it.
		 */
		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
				       XBF_DONE | XBF_WRITE_FAIL;
			xfs_buf_submit(bp);
		} else {
			xfs_buf_relse(bp);
		}

		return;
	}

	/*
	 * If the write of the buffer was synchronous, we want to make
	 * sure to return the error to the caller of xfs_bwrite().
	 */
	xfs_buf_stale(bp);
	XFS_BUF_DONE(bp);

	trace_xfs_buf_error_relse(bp, _RET_IP_);

do_callbacks:
	xfs_buf_do_callbacks(bp);
	bp->b_fspriv = NULL;
	bp->b_iodone = NULL;
	xfs_buf_ioend(bp);
}

/*
 * This is the iodone() function for buffers which have been
 * logged.  It is called when they are eventually flushed out.
 * It should remove the buf item from the AIL, and free the buf item.
 * It is called by xfs_buf_iodone_callbacks() above which will take
 * care of cleaning up the buffer itself.
 */
void
xfs_buf_iodone(
	struct xfs_buf		*bp,
	struct xfs_log_item	*lip)
{
	struct xfs_ail		*ailp = lip->li_ailp;

	ASSERT(BUF_ITEM(lip)->bli_buf == bp);

	xfs_buf_rele(bp);

	/*
	 * If we are forcibly shutting down, this may well be
	 * off the AIL already. That's because we simulate the
	 * log-committed callbacks to unpin these buffers. Or we may never
	 * have put this item on AIL because the transaction was
	 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
	 *
	 * Either way, AIL is useless if we're forcing a shutdown.
	 */
	spin_lock(&ailp->xa_lock);
	xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
	xfs_buf_item_free(BUF_ITEM(lip));
}
diff --git a/kernel/fs/xfs/xfs_buf_item.h b/kernel/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000..3f3455a41
--- /dev/null
+++ b/kernel/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,76 @@
/*
 * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef	__XFS_BUF_ITEM_H__
#define	__XFS_BUF_ITEM_H__

/* kernel only definitions */

/* buf log item flags */
#define	XFS_BLI_HOLD		0x01
#define	XFS_BLI_DIRTY		0x02
#define	XFS_BLI_STALE		0x04
#define	XFS_BLI_LOGGED		0x08
#define	XFS_BLI_INODE_ALLOC_BUF	0x10
#define XFS_BLI_STALE_INODE	0x20
#define	XFS_BLI_INODE_BUF	0x40
#define	XFS_BLI_ORDERED		0x80

/* flag value to name pairs for the buf log item flags above */
#define XFS_BLI_FLAGS \
	{ XFS_BLI_HOLD,		"HOLD" }, \
	{ XFS_BLI_DIRTY,	"DIRTY" }, \
	{ XFS_BLI_STALE,	"STALE" }, \
	{ XFS_BLI_LOGGED,	"LOGGED" }, \
	{ XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
	{ XFS_BLI_STALE_INODE,	"STALE_INODE" }, \
	{ XFS_BLI_INODE_BUF,	"INODE_BUF" }, \
	{ XFS_BLI_ORDERED,	"ORDERED" }


struct xfs_buf;
struct xfs_mount;
struct xfs_buf_log_item;

/*
 * This is the in core log item structure used to track information
 * needed to log buffers.  It tracks how many times the lock has been
 * locked, and which 128 byte chunks of the buffer are dirty.
 *
 * bli_formats points either at the embedded __bli_format (single segment
 * buffers) or at a separately allocated array of bli_format_count entries.
 */
typedef struct xfs_buf_log_item {
	xfs_log_item_t		bli_item;	/* common item structure */
	struct xfs_buf		*bli_buf;	/* real buffer pointer */
	unsigned int		bli_flags;	/* misc flags */
	unsigned int		bli_recur;	/* lock recursion count */
	atomic_t		bli_refcount;	/* cnt of tp refs */
	int			bli_format_count;	/* count of headers */
	struct xfs_buf_log_format *bli_formats;	/* array of in-log header ptrs */
	struct xfs_buf_log_format __bli_format;	/* embedded in-log header */
} xfs_buf_log_item_t;

void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void	xfs_buf_item_relse(struct xfs_buf *);
void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint	xfs_buf_item_dirty(xfs_buf_log_item_t *);
void	xfs_buf_attach_iodone(struct xfs_buf *,
			      void(*)(struct xfs_buf *, xfs_log_item_t *),
			      xfs_log_item_t *);
void	xfs_buf_iodone_callbacks(struct xfs_buf *);
void	xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);

extern kmem_zone_t	*xfs_buf_item_zone;

#endif	/* __XFS_BUF_ITEM_H__ */
diff --git a/kernel/fs/xfs/xfs_dir2_readdir.c b/kernel/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 000000000..098cd78fe
--- /dev/null
+++ b/kernel/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,681 @@
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2013 Red Hat, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" + +/* + * Directory file type support functions + * + * The table is indexed by the file type value stored in a directory entry + * and is only ever read, hence const. + */ +static const unsigned char xfs_dir3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, + DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, +}; + +/* + * Translate a directory entry file type into the DT_* code handed to + * dir_emit(). DT_UNKNOWN is returned when the superblock lacks the ftype + * feature or when @filetype is not below XFS_DIR3_FT_MAX. + */ +static unsigned char +xfs_dir3_get_dtype( + struct xfs_mount *mp, + __uint8_t filetype) +{ + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return DT_UNKNOWN; + + if (filetype >= XFS_DIR3_FT_MAX) + return DT_UNKNOWN; + + return xfs_dir3_filetype_table[filetype]; +} + +STATIC int +xfs_dir2_sf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + int i; /* shortform entry number */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + struct xfs_da_geometry *geo = args->geo; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short.
+ */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return -EIO; + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * geo->datablk + */ + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dot_offset); + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dotdot_offset); + + /* + * Put . entry unless we're starting past it. + */ + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) + return 0; + } + + /* + * Put .. entry unless we're starting past it. + */ + if (ctx->pos <= dotdot_offset) { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) + return 0; + } + + /* + * Loop while there are more entries and put'ing works. 
+ */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + __uint8_t filetype; + + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + + if (ctx->pos > off) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + continue; + } + + ino = dp->d_ops->sf_get_ino(sfp, sfep); + filetype = dp->d_ops->sf_get_ftype(sfep); + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, + xfs_dir3_get_dtype(dp->i_mount, filetype))) + return 0; + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Readdir for block directories. + */ +STATIC int +xfs_dir2_block_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + error = xfs_dir3_block_read(NULL, dp, &bp); + if (error) + return error; + + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Set up values for the loop. 
+ */ + btp = xfs_dir2_block_tail_p(geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + __uint8_t filetype; + + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += dp->d_ops->data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)hdr < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (char *)dep - (char *)hdr); + + ctx->pos = cook & 0x7fffffff; + filetype = dp->d_ops->data_get_ftype(dep); + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) { + xfs_trans_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. 
+ */ + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + xfs_trans_brelse(NULL, bp); + return 0; +} + +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_da_args *args, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; + int error = 0; + int length; + int i; + int j; + struct xfs_da_geometry *geo = args->geo; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= geo->fsbcount; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = geo->fsbcount; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. 
+ */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(geo, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); + /* + * Should just skip over the data block instead of giving up. 
+ */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= geo->fsbcount; + + /* + * Do we need more readahead? + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += geo->fsbcount) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(dp->i_mount, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < geo->fsbcount; j += length ) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, geo->fsbcount, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + blk_finish_plug(&plug); + +out: + *bpp = bp; + return error; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. 
+ */ +STATIC int +xfs_dir2_leaf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + char *ptr = NULL; /* pointer to current data */ + struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) + return 0; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP | KM_NOFS); + map_info->map_size = length; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); + + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + __uint8_t filetype; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. 
+ */ + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) + break; + + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == + map_info->curdb); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)dp->d_ops->data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(geo, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += dp->d_ops->data_entry_offset; + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)hdr < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + dp->d_ops->data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + geo->blksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. 
+ */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = dp->d_ops->data_entsize(dep->namelen); + filetype = dp->d_ops->data_get_ftype(dep); + + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + kmem_free(map_info); + if (bp) + xfs_trans_brelse(NULL, bp); + return error; +} + +/* + * Read a directory. 
+ */ +int +xfs_readdir( + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_getdents); + + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(&args, ctx); + else + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + + return rval; +} diff --git a/kernel/fs/xfs/xfs_discard.c b/kernel/fs/xfs/xfs_discard.c new file mode 100644 index 000000000..e85a9519a --- /dev/null +++ b/kernel/fs/xfs/xfs_discard.c @@ -0,0 +1,239 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_discard.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Position the cursor at the longest extent recorded in the AGF + * (agf_longest) and start with it. The loop steps the cursor + * backwards from here and gives up at the first extent shorter than + * minlen. + */ + error = xfs_alloc_lookup_ge(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding.
+ */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); + + /* + * Too small? Give up. + */ + if (dlen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (dbno + dlen < start || dbno > end) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. 
Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_daddr_t start, end, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, urange, sizeof(range))) + return -EFAULT; + + /* + * Truncating down the len isn't actually quite correct, but using + * BBTOB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. 
+ */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) + return -EINVAL; + + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); + + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; + + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); + + for (agno = start_agno; agno <= end_agno; agno++) { + error = xfs_trim_extents(mp, agno, start, end, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -EFAULT; + return 0; +} + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_extent_busy *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/kernel/fs/xfs/xfs_discard.h b/kernel/fs/xfs/xfs_discard.h new file mode 100644 index 000000000..344879aea --- /dev/null +++ b/kernel/fs/xfs/xfs_discard.h @@ -0,0 +1,10 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; +struct list_head; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); + +#endif /* XFS_DISCARD_H */ diff --git a/kernel/fs/xfs/xfs_dquot.c 
b/kernel/fs/xfs/xfs_dquot.c new file mode 100644 index 000000000..02c01bbbc --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot.c @@ -0,0 +1,1104 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" + +/* + * Lock order: + * + * ip->i_lock + * qi->qi_tree_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * qi->qi_lru_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. 
+ */ + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +struct kmem_zone *xfs_qm_dqtrxzone; +static struct kmem_zone *xfs_qm_dqzone; + +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(list_empty(&dqp->q_lru)); + + mutex_destroy(&dqp->q_qlock); + kmem_zone_free(xfs_qm_dqzone, dqp); + + XFS_STATS_DEC(xs_qm_dquot); +} + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + struct xfs_mount *mp, + struct xfs_dquot *dq) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { + d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { + d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } + if (q->qi_isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). 
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). + */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef DEBUG + if (d->d_blk_hardlimit) + ASSERT(be64_to_cpu(d->d_blk_softlimit) <= + be64_to_cpu(d->d_blk_hardlimit)); + if (d->d_ino_hardlimit) + ASSERT(be64_to_cpu(d->d_ino_softlimit) <= + be64_to_cpu(d->d_ino_hardlimit)); + if (d->d_rtb_hardlimit) + ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= + be64_to_cpu(d->d_rtb_hardlimit)); +#endif + + if (!d->d_btimer) { + if ((d->d_blk_softlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_softlimit))) || + (d->d_blk_hardlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_btimelimit); + } else { + d->d_bwarns = 0; + } + } else { + if ((!d->d_blk_softlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_softlimit))) && + (!d->d_blk_hardlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((d->d_ino_softlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_softlimit))) || + (d->d_ino_hardlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_itimelimit); + } else { + d->d_iwarns = 0; + } + } else { + if ((!d->d_ino_softlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_softlimit))) && + (!d->d_ino_hardlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((d->d_rtb_softlimit && + 
(be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_softlimit))) || + (d->d_rtb_hardlimit && + (be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_rtbtimelimit); + } else { + d->d_rtbwarns = 0; + } + } else { + if ((!d->d_rtb_softlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_softlimit))) && + (!d->d_rtb_hardlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * Initialize a buffer full of dquots and log the whole thing. + * + * Each dquot in the chunk is stamped with the magic number, version, @type + * flags and a consecutive id, the first being @id rounded down to a multiple + * of qi_dqperchunk. When the superblock has CRCs enabled the UUID and + * checksum are filled in too. + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_dqblk_t *d; + xfs_dqid_t curid; + int i; + + ASSERT(tp); + ASSERT(xfs_buf_islocked(bp)); + + d = bp->b_addr; + + /* + * ID of the first dquot in the block - id's are zero based. It is + * held in the same type as @id so that large ids are never pushed + * through a signed int. + */ + curid = id - (id % q->qi_dqperchunk); + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } + + xfs_trans_dquot_buf(tp, bp, + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); +} + +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit.
+ */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); /* space is now 1% of the hard limit */ + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. + * + * On success the initialized buffer is returned in *O_bpp and dqp->q_blkno + * records its disk address. *tpp may point to a different transaction on + * return, because xfs_bmap_finish() is allowed to commit the one passed in. + * Returns -ESRCH if this quota type was turned off before the quota inode + * lock was taken, -ENOMEM if the buffer could not be obtained, or the error + * from the bmap calls. + */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t **tpp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + xfs_trans_t *tp = *tpp; + + ASSERT(tp != NULL); + + trace_xfs_dqalloc(dqp); + + /* + * Initialize the bmap freelist prior to calling bmapi code. 
+ */ + xfs_bmap_init(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return -ESRCH; + } + + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) + goto error0; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0); + if (!bp) { + error = -ENOMEM; + goto error1; + } + bp->b_ops = &xfs_dquot_buf_ops; + + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + + /* + * xfs_bmap_finish() may commit the current transaction and + * start a second transaction if the freelist is not empty. + * + * Since we still want to modify this buffer, we need to + * ensure that the buffer is not released on commit of + * the first transaction and ensure the buffer is added to the + * second transaction. + * + * If there is only one transaction then don't stop the buffer + * from being released when it commits later on. 
+ */ + + xfs_trans_bhold(tp, bp); + + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + goto error1; + } + + if (committed) { + tp = *tpp; + xfs_trans_bjoin(tp, bp); + } else { + xfs_trans_bhold_release(tp, bp); + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return error; +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. make sure we verify it on write, though. + * That is why NULL buffer ops are passed to the read and the dquot + * buffer ops are only attached once the read has succeeded. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return error; + } + (*bpp)->b_ops = &xfs_dquot_buf_ops; + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* + * Do the actual repair of dquots in this buffer: slot i is checked + * against id firstid + i and the type of dqp. + */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return -EIO; + } + } + + return 0; +} + +/* + * Maps a dquot to the buffer containing its on-disk version. 
+ * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t **tpp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; + + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + + lock_mode = xfs_ilock_data_map_shared(quotip); + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. + */ + xfs_iunlock(quotip, lock_mode); + return -ESRCH; + } + + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); + + xfs_iunlock(quotip, lock_mode); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Byte offset of dquot in the (fixed sized) dquot chunk. 
+ */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return -ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; /* pick up whichever transaction xfs_qm_dqalloc() left in *tpp */ + } else { + trace_xfs_dqtobp_read(dqp); + + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp, &xfs_dquot_buf_ops); + + if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { /* verifier rejected the buffer and the caller asked for repair */ + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } + + if (error) { + ASSERT(bp == NULL); + return error; + } + } + + ASSERT(xfs_buf_islocked(bp)); + *O_bpp = bp; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; + + return 0; +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if needed. 
+ */ +int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; + + + dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_lru); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } + + XFS_STATS_INC(xs_qm_dquot); + + trace_xfs_dqread(dqp); + + if (flags & XFS_QMOPT_DQALLOC) { /* a transaction is only needed when we may have to allocate on disk */ + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we didn't ask to + * allocate (ENOENT). 
+ */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; + } + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add every time. + */ + dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); + dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); + dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + + /* Mark the buf so that this will stay incore a little longer */ + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. + * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_trans_brelse(tp, bp); + + if (tp) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + } + + *O_dqpp = dqp; + return error; + +error1: + if (tp) + xfs_trans_cancel(tp, cancelflags); +error0: /* every failure path frees the incore dquot and hands back NULL */ + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return error; +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. 
+ * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return -ESRCH; + } + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + xfs_debug(mp, "Returning error in dqget"); + return -EIO; + } + } + + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); + if (ip) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + } +#endif + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (dqp) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { /* being torn down: back off and retry the lookup */ + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(xs_qm_dqcachehits); + *O_dqpp = dqp; + return 0; + } + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. 
OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. + */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqread(mp, id, type, flags, &dqp); + + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; + + if (ip) { + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { + xfs_qm_dqdestroy(dqp); + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; + } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return -ESRCH; + } + } + + mutex_lock(&qi->qi_tree_lock); + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + + /* + * Duplicate found. Just throw away the new dquot and start + * over. + */ + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(xs_qm_dquot_dups); + goto restart; + } + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + dqret: + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} + +/* + * Release a reference to the dquot (decrement ref-count) and unlock it. + * + * The dquot must be locked by the caller. When the last reference goes + * away the dquot is not freed here; it is added to the quotainfo LRU list + * (qi_lru) instead. + */ +void +xfs_qm_dqput( + struct xfs_dquot *dqp) +{ + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); +} + +/* + * Release a dquot. 
Lock it, then dqput() it; a dirty dquot is not flushed here. + * A NULL dquot is silently ignored. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + if (!dqp) + return; + + trace_xfs_dqrele(dqp); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. 
+ * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. + */ +int +xfs_qm_dqflush( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + trace_xfs_dqflush(dqp); + + *bpp = NULL; + + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = -EIO; + goto out_unlock; + } + + /* + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + if (error) + goto out_unlock; + + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = bp->b_addr + dqp->q_bufoffset; + + /* + * A simple sanity check in case we got a corrupted dquot.. 
+ */ + error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)"); + if (error) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EIO; + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); + + /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) { + trace_xfs_dqflush_force(dqp); + xfs_log_force(mp, 0); + } + + trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; + +out_unlock: + xfs_dqfunlock(dqp); + return -EIO; +} + +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. 
+ */ +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (be32_to_cpu(d1->q_core.d_id) > + be32_to_cpu(d2->q_core.d_id)) { + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); + } else { + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); + } + } else if (d1) { /* either pointer may be NULL: lock whichever one is present */ + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); + } +} + +int __init +xfs_qm_init(void) /* create the dquot and dqtrx zones; -ENOMEM if either cannot be set up */ +{ + xfs_qm_dqzone = + kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + if (!xfs_qm_dqzone) + goto out; + + xfs_qm_dqtrxzone = + kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + if (!xfs_qm_dqtrxzone) + goto out_free_dqzone; + + return 0; + +out_free_dqzone: /* undo the first zone before failing */ + kmem_zone_destroy(xfs_qm_dqzone); +out: + return -ENOMEM; +} + +void +xfs_qm_exit(void) /* destroy both zones created by xfs_qm_init() */ +{ + kmem_zone_destroy(xfs_qm_dqtrxzone); + kmem_zone_destroy(xfs_qm_dqzone); +} diff --git a/kernel/fs/xfs/xfs_dquot.h b/kernel/fs/xfs/xfs_dquot.h new file mode 100644 index 000000000..2f536f33c --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both of those in their algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +struct xfs_mount; +struct xfs_trans; + +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_lru; /* global free list of dquots */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* byte offset of dq in buffer */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; /* quota lock */ + struct completion q_flush; /* 
flush completion queue */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ +} xfs_dquot_t; + +/* + * Lock hierarchy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + +/* + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. + * + * xfs_dqflock() waits for the completion, xfs_dqflock_nowait() only tries to + * take it and reports whether it succeeded, and xfs_dqfunlock() releases it + * again by completing q_flush. + */ +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) /* trylock of q_qlock */ +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) /* take q_qlock */ +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock(struct xfs_dquot *dqp) /* drop q_qlock */ +{ + mutex_unlock(&dqp->q_qlock); +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) /* is the quota type selected by the XFS_DQ_ALLTYPES bits of @type turned on? */ +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return XFS_IS_UQUOTA_ON(mp); + case XFS_DQ_GROUP: + return XFS_IS_GQUOTA_ON(mp); + case XFS_DQ_PROJ: + return XFS_IS_PQUOTA_ON(mp); + default: + return 0; + } +} + +static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) /* dquot of the given type attached to @ip, NULL for an unknown type */ +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return ip->i_udquot; + case XFS_DQ_GROUP: + return ip->i_gdquot; + case XFS_DQ_PROJ: + return ip->i_pdquot; + default: + return NULL; + } +} + +/* + * Check whether a dquot is under low free space conditions. We assume the quota + * is enabled and enforced. 
+ */ +static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) +{ + int64_t freesp; + + freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount; /* blocks left before the hard limit, reservations included */ + if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT]) /* below the lowest (1%) low-space threshold */ + return true; + + return false; +} + +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) + +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); + +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) /* take an extra reference under the dquot lock and return the same dquot */ +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} + +#endif /* __XFS_DQUOT_H__ */ diff --git a/kernel/fs/xfs/xfs_dquot_item.c b/kernel/fs/xfs/xfs_dquot_item.c new file mode 100644 index 000000000..814cff94e --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot_item.c @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" + +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) /* map a generic log item back to the dquot log item that embeds it */ +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + +/* + * Accounts for the space needed to log the given dquot item: adds two + * iovecs to *nvecs and the size of the dquot log format header plus the + * on-disk dquot to *nbytes. + */ +STATIC void +xfs_qm_dquot_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 2; + *nbytes += sizeof(struct xfs_dq_logformat) + + sizeof(struct xfs_disk_dquot); +} + +/* + * fills in the vector of log iovecs for the given dquot log item. 
+ */ +STATIC void +xfs_qm_dquot_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; /* two regions: this format header and the dquot core copied below */ + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); +} + +/* + * Increment the pin count of the given dquot. + * The dquot is expected to be locked by the caller. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + atomic_inc(&dqp->q_pincount); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_qm_dqunpin_wait() if the count goes to 0. The + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). + */ +STATIC void +xfs_qm_dquot_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); +} + +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return lsn; +} + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. 
+ */ +void +xfs_qm_dqunpin_wait( + struct xfs_dquot *dqp) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (atomic_read(&dqp->q_pincount) == 0) + return; + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, 0); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); +} + +STATIC uint +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (atomic_read(&dqp->q_pincount) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_dqlock_nowait(dqp)) + return XFS_ITEM_LOCKED; + + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ + if (!xfs_dqflock_nowait(dqp)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; + } + + spin_unlock(&lip->li_ailp->xa_lock); /* the AIL lock is dropped across the flush and retaken below */ + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; +} + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. Dquots are + * never held, so the dquot is always unlocked here. 
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * Clear the transaction pointer in the dquot
+	 */
+	dqp->q_transp = NULL;
+
+	/*
+	 * dquots are never 'held' from getting unlocked at the end of
+	 * a transaction.  Their locking and unlocking is hidden inside the
+	 * transaction layer, within trans_commit.  Hence, no LI_HOLD flag
+	 * for the logitem.
+	 */
+	xfs_dqunlock(dqp);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ *
+ * As written this is a no-op: neither argument is used.
+ */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector for dquots.  It is installed on each dquot's
+ * log item by xfs_qm_dquot_logitem_init() below.
+ */
+static const struct xfs_item_ops xfs_dquot_item_ops = {
+	.iop_size	= xfs_qm_dquot_logitem_size,
+	.iop_format	= xfs_qm_dquot_logitem_format,
+	.iop_pin	= xfs_qm_dquot_logitem_pin,
+	.iop_unpin	= xfs_qm_dquot_logitem_unpin,
+	.iop_unlock	= xfs_qm_dquot_logitem_unlock,
+	.iop_committed	= xfs_qm_dquot_logitem_committed,
+	.iop_push	= xfs_qm_dquot_logitem_push,
+	.iop_committing = xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ * The log item is the one embedded in the dquot (q_logitem) and is given
+ * a back pointer to it.
+ */
+void
+xfs_qm_dquot_logitem_init(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_dq_logitem	*lp = &dqp->q_logitem;
+
+	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+					&xfs_dquot_item_ops);
+	lp->qli_dquot = dqp;
+}
+
+/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_qoff_logitem, qql_item);	/* lip is the embedded qql_item */
+}
+
+
+/*
+ * This accounts for the iovecs and bytes needed to log the given quotaoff item.
+ * We only need 1 iovec for a quotaoff item.
It just logs the + * quotaoff_log_format structure. + */ +STATIC void +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_qoff_logitem); +} + +STATIC void +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); +} + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +STATIC void +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +STATIC void +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) +{ +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. 
+ */
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_qoff_logitem	*qfe = QOFF_ITEM(lip);
+	struct xfs_qoff_logitem	*qfs = qfe->qql_start_lip;
+	struct xfs_ail		*ailp = qfs->qql_item.li_ailp;
+
+	/*
+	 * Delete the qoff-start logitem from the AIL.
+	 * xfs_trans_ail_delete() drops the AIL lock.
+	 *
+	 * Both the start item and this end item are freed afterwards, so
+	 * neither may be referenced once this function has returned.
+	 */
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+
+	kmem_free(qfs);
+	kmem_free(qfe);
+	return (xfs_lsn_t)-1;	/* NOTE(review): presumably tells the caller the item is gone - confirm */
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this.  I think we can
+ * just ignore it.  The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen?  Also, do we need different routines for
+ * quotaoff start and quotaoff end?  I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable).  If we do something else,
+ * then maybe we don't need two.
+ */ +STATIC void +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +struct xfs_qoff_logitem * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) +{ + struct xfs_qoff_logitem *qf; + + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); + qf->qql_item.li_mountp = mp; + qf->qql_start_lip = start; + qf->qql_flags = flags; + return qf; +} diff --git a/kernel/fs/xfs/xfs_dquot_item.h b/kernel/fs/xfs/xfs_dquot_item.h new file mode 100644 index 000000000..502e94646 --- /dev/null +++ b/kernel/fs/xfs/xfs_dquot_item.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_ITEM_H__ +#define __XFS_DQUOT_ITEM_H__ + +struct xfs_dquot; +struct xfs_trans; +struct xfs_mount; +struct xfs_qoff_logitem; + +typedef struct xfs_dq_logitem { + xfs_log_item_t qli_item; /* common portion */ + struct xfs_dquot *qli_dquot; /* dquot ptr */ + xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ +} xfs_dq_logitem_t; + +typedef struct xfs_qoff_logitem { + xfs_log_item_t qql_item; /* common portion */ + struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ + unsigned int qql_flags; +} xfs_qoff_logitem_t; + + +extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *); +extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *, + struct xfs_qoff_logitem *, uint); +extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *, uint); +extern void xfs_trans_log_quotaoff_item(struct xfs_trans *, + struct xfs_qoff_logitem *); + +#endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_error.c b/kernel/fs/xfs/xfs_error.c new file mode 100644 index 000000000..338e50bbf --- /dev/null +++ b/kernel/fs/xfs/xfs_error.c @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_fs.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" + +#ifdef DEBUG + +int xfs_etest[XFS_NUM_INJECT_ERROR]; +int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; +char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; + +int +xfs_error_test(int error_tag, int *fsidp, char *expression, + int line, char *file, unsigned long randfactor) +{ + int i; + int64_t fsid; + + if (prandom_u32() % randfactor) + return 0; + + memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { + xfs_warn(NULL, + "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", + expression, file, line, xfs_etest_fsname[i]); + return 1; + } + } + + return 0; +} + +int +xfs_errortag_add(int error_tag, xfs_mount_t *mp) +{ + int i; + int len; + int64_t fsid; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { + xfs_warn(mp, "error tag #%d on", error_tag); + return 0; + } + } + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if (xfs_etest[i] == 0) { + xfs_warn(mp, "Turned on XFS error tag #%d", + error_tag); + 
xfs_etest[i] = error_tag; + xfs_etest_fsid[i] = fsid; + len = strlen(mp->m_fsname); + xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); + strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; + return 0; + } + } + + xfs_warn(mp, "error tag overflow, too many turned on"); + + return 1; +} + +int +xfs_errortag_clearall(xfs_mount_t *mp, int loud) +{ + int64_t fsid; + int cleared = 0; + int i; + + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); + + + for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { + if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && + xfs_etest[i] != 0) { + cleared = 1; + xfs_warn(mp, "Clearing XFS error tag #%d", + xfs_etest[i]); + xfs_etest[i] = 0; + xfs_etest_fsid[i] = 0LL; + kmem_free(xfs_etest_fsname[i]); + xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; + } + } + + if (loud || cleared) + xfs_warn(mp, "Cleared all XFS error tags for filesystem"); + + return 0; +} +#endif /* DEBUG */ + +void +xfs_error_report( + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) { + xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, + "Internal error %s at line %d of file %s. Caller %pS", + tag, linenum, filename, ra); + + xfs_stack_trace(); + } +} + +void +xfs_corruption_error( + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) +{ + if (level <= xfs_error_level) + xfs_hex_dump(p, 64); + xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); +} + +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == -EFSBADCRC ? 
"CRC error" : "corruption", + __return_address, bp->b_bn); + + xfs_alert(mp, "Unmount and run xfs_repair"); + + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); + } + + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); +} diff --git a/kernel/fs/xfs/xfs_error.h b/kernel/fs/xfs/xfs_error.h new file mode 100644 index 000000000..c0394ed12 --- /dev/null +++ b/kernel/fs/xfs/xfs_error.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ERROR_H__ +#define __XFS_ERROR_H__ + +struct xfs_mount; + +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); + +#define XFS_ERROR_REPORT(e, lvl, mp) \ + xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) +#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ + xfs_corruption_error(e, lvl, mp, mem, \ + __FILE__, __LINE__, __return_address) + +#define XFS_ERRLEVEL_OFF 0 +#define XFS_ERRLEVEL_LOW 1 +#define XFS_ERRLEVEL_HIGH 5 + +/* + * Macros to set EFSCORRUPTED & return/branch. + */ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ + XFS_ERRLEVEL_LOW, mp); \ + error = -EFSCORRUPTED; \ + goto l; \ + } \ + } + +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ + { \ + int fs_is_ok = (x); \ + ASSERT(fs_is_ok); \ + if (unlikely(!fs_is_ok)) { \ + XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ + XFS_ERRLEVEL_LOW, mp); \ + return -EFSCORRUPTED; \ + } \ + } + +/* + * error injection tags - the labels can be anything you want + * but each tag should have its own unique number + */ + +#define XFS_ERRTAG_NOERROR 0 +#define XFS_ERRTAG_IFLUSH_1 1 +#define XFS_ERRTAG_IFLUSH_2 2 +#define XFS_ERRTAG_IFLUSH_3 3 +#define XFS_ERRTAG_IFLUSH_4 4 +#define XFS_ERRTAG_IFLUSH_5 5 +#define XFS_ERRTAG_IFLUSH_6 6 +#define XFS_ERRTAG_DA_READ_BUF 7 +#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8 +#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9 +#define XFS_ERRTAG_ALLOC_READ_AGF 10 
+#define XFS_ERRTAG_IALLOC_READ_AGI 11 +#define XFS_ERRTAG_ITOBP_INOTOBP 12 +#define XFS_ERRTAG_IUNLINK 13 +#define XFS_ERRTAG_IUNLINK_REMOVE 14 +#define XFS_ERRTAG_DIR_INO_VALIDATE 15 +#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16 +#define XFS_ERRTAG_IODONE_IOERR 17 +#define XFS_ERRTAG_STRATREAD_IOERR 18 +#define XFS_ERRTAG_STRATCMPL_IOERR 19 +#define XFS_ERRTAG_DIOWRITE_IOERR 20 +#define XFS_ERRTAG_BMAPIFORMAT 21 +#define XFS_ERRTAG_MAX 22 + +/* + * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. + */ +#define XFS_RANDOM_DEFAULT 100 +#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) +#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT +#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT +#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT +#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT + +#ifdef DEBUG +extern int xfs_error_test_active; +extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); + +#define XFS_NUM_INJECT_ERROR 10 +#define XFS_TEST_ERROR(expr, mp, tag, rf) \ + ((expr) || (xfs_error_test_active && \ + xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, 
__FILE__, \ + (rf)))) + +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); +#else +#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) +#define xfs_errortag_add(tag, mp) (ENOSYS) +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ + +/* + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into + * a panic by setting xfs_panic_mask in a sysctl. + */ +#define XFS_NO_PTAG 0 +#define XFS_PTAG_IFLUSH 0x00000001 +#define XFS_PTAG_LOGRES 0x00000002 +#define XFS_PTAG_AILDELETE 0x00000004 +#define XFS_PTAG_ERROR_REPORT 0x00000008 +#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 +#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 +#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 + +#endif /* __XFS_ERROR_H__ */ diff --git a/kernel/fs/xfs/xfs_export.c b/kernel/fs/xfs/xfs_export.c new file mode 100644 index 000000000..652cd3c5b --- /dev/null +++ b/kernel/fs/xfs/xfs_export.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_export.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_pnfs.h" + +/* + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. + */ +static int xfs_fileid_length(int fileid_type) +{ + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; + } + return FILEID_INVALID; +} + +STATIC int +xfs_fs_encode_fh( + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) +{ + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; + int fileid_type; + int len; + + /* Directories don't need their parent encoded, they have ".." */ + if (!parent) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. 
+ */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(fileid_type); + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + *max_len = len; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = XFS_I(inode)->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = XFS_I(inode)->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + /* + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. + */ + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error) { + /* + * EINVAL means the inode cluster doesn't exist anymore. + * This implies the filehandle is stale, so we should + * translate it here. 
+ * We don't use ESTALE directly down the chain to not + * confuse applications using bulkstat that expect EINVAL. + */ + if (error == -EINVAL || error == -ENOENT) + error = -ESTALE; + return ERR_PTR(error); + } + + if (ip->i_d.di_gen != generation) { + IRELE(ip); + return ERR_PTR(-ESTALE); + } + + return VFS_I(ip); +} + +STATIC struct dentry * +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_get_parent( + struct dentry *child) +{ + int error; + struct xfs_inode *cip; + + error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL); + if (unlikely(error)) + return ERR_PTR(error); + + return d_obtain_alias(VFS_I(cip)); +} + +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + 
+ xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + +const struct export_operations xfs_export_operations = { + .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, + .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, +#ifdef CONFIG_NFSD_PNFS + .get_uuid = xfs_fs_get_uuid, + .map_blocks = xfs_fs_map_blocks, + .commit_blocks = xfs_fs_commit_blocks, +#endif +}; diff --git a/kernel/fs/xfs/xfs_export.h b/kernel/fs/xfs/xfs_export.h new file mode 100644 index 000000000..3272b6ae7 --- /dev/null +++ b/kernel/fs/xfs/xfs_export.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. 
+ * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesystem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +#endif /* __XFS_EXPORT_H__ */ diff --git a/kernel/fs/xfs/xfs_extent_busy.c b/kernel/fs/xfs/xfs_extent_busy.c new file mode 100644 index 000000000..c263e0792 --- /dev/null +++ b/kernel/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,604 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_log.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. 
+ */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. 
+ */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entry's block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. 
+ */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. 
+ */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. 
If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. 
+ */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. 
+ */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. 
+ */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/kernel/fs/xfs/xfs_extent_busy.h b/kernel/fs/xfs/xfs_extent_busy.h new file mode 100644 index 000000000..bfff284d2 --- /dev/null +++ b/kernel/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +struct xfs_mount; +struct xfs_trans; +struct xfs_alloc_arg; + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/kernel/fs/xfs/xfs_extfree_item.c b/kernel/fs/xfs/xfs_extfree_item.c new file mode 100644 
index 000000000..cb7fe64cd --- /dev/null +++ b/kernel/fs/xfs/xfs_extfree_item.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_extfree_item.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_efi_zone; +kmem_zone_t *xfs_efd_zone; + +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} + +void +xfs_efi_item_free( + struct xfs_efi_log_item *efip) +{ + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) + kmem_free(efip); + else + kmem_zone_free(xfs_efi_zone, efip); +} + +/* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. 
+ */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (atomic_dec_and_test(&efip->efi_refcount)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); + } +} + +/* + * This returns the number of iovecs needed to log the given efi item. + * We only need 1 iovec for an efi item. It just logs the efi_log_format + * structure. + */ +static inline int +xfs_efi_item_sizeof( + struct xfs_efi_log_item *efip) +{ + return sizeof(struct xfs_efi_log_format) + + (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efi_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efi log item. We use only 1 iovec, and we point that + * at the efi_log_format structure embedded in the efi item. + * It is at this point that we assert that all of the extent + * slots in the efi item have been filled. + */ +STATIC void +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); + + efip->efi_format.efi_type = XFS_LI_EFI; + efip->efi_format.efi_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); +} + + +/* + * Pinning has no meaning for an efi item, so just return. + */ +STATIC void +xfs_efi_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. 
If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. + */ +STATIC void +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) + xfs_trans_del_item(lip); + xfs_efi_item_free(efip); + return; + } + __xfs_efi_release(efip); +} + +/* + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. + */ +STATIC uint +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +STATIC void +xfs_efi_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); +} + +/* + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The EFI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efi log items. 
+ */ +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing +}; + + +/* + * Allocate and initialize an efi item with the given number of extents. + */ +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_efi_log_item *efip; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efi_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, KM_SLEEP); + } else { + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); + efip->efi_format.efi_nextents = nextents; + efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); + + return efip; +} + +/* + * Copy an EFI format buffer from the given buf, and into the destination + * EFI format structure. + * The given buffer can be in 32 bit or 64 bit form (which has different padding), + * one of which will be the native format for this kernel. + * It will handle the conversion of formats if necessary. 
+ */ +int +xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +{ + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; + uint i; + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + + if (buf->i_len == len) { + memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); + return 0; + } else if (buf->i_len == len32) { + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_32->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_32->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_32->efi_extents[i].ext_len; + } + return 0; + } else if (buf->i_len == len64) { + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + + dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; + dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; + dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents; + dst_efi_fmt->efi_id = src_efi_fmt_64->efi_id; + for (i = 0; i < dst_efi_fmt->efi_nextents; i++) { + dst_efi_fmt->efi_extents[i].ext_start = + src_efi_fmt_64->efi_extents[i].ext_start; + dst_efi_fmt->efi_extents[i].ext_len = + src_efi_fmt_64->efi_extents[i].ext_len; + } + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. 
+ */ +void +xfs_efi_release(xfs_efi_log_item_t *efip, + uint nextents) +{ + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ + } +} + +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efd_log_item, efd_item); +} + +STATIC void +xfs_efd_item_free(struct xfs_efd_log_item *efdp) +{ + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) + kmem_free(efdp); + else + kmem_zone_free(xfs_efd_zone, efdp); +} + +/* + * This returns the number of iovecs needed to log the given efd item. + * We only need 1 iovec for an efd item. It just logs the efd_log_format + * structure. + */ +static inline int +xfs_efd_item_sizeof( + struct xfs_efd_log_item *efdp) +{ + return sizeof(xfs_efd_log_format_t) + + (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given efd log item. We use only 1 iovec, and we point that + * at the efd_log_format structure embedded in the efd item. + * It is at this point that we assert that all of the extent + * slots in the efd item have been filled. 
+ */ +STATIC void +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); + + efdp->efd_format.efd_type = XFS_LI_EFD; + efdp->efd_format.efd_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); +} + +/* + * Pinning has no meaning for an efd item, so just return. + */ +STATIC void +xfs_efd_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an efd item, unpinning does + * not either. + */ +STATIC void +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +STATIC void +xfs_efd_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); +} + +/* + * When the efd item is committed to disk, all we need to do + * is delete our reference to our partner efi item and then + * free ourselves. Since we're freeing ourselves we must + * return -1 to keep the transaction code from further referencing + * this item. + */ +STATIC xfs_lsn_t +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + /* + * If we got a log I/O error, it's always the case that the LR with the + * EFI got unpinned and freed before the EFD got aborted. + */ + if (!(lip->li_flags & XFS_LI_ABORTED)) + xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); + + xfs_efd_item_free(efdp); + return (xfs_lsn_t)-1; +} + +/* + * The EFD dependency tracking op doesn't do squat. 
It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all efd log items. + */ +static const struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing +}; + +/* + * Allocate and initialize an efd item with the given number of extents. + */ +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) + +{ + struct xfs_efd_log_item *efdp; + uint size; + + ASSERT(nextents > 0); + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + size = (uint)(sizeof(xfs_efd_log_item_t) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efdp = kmem_zalloc(size, KM_SLEEP); + } else { + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + } + + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return efdp; +} diff --git a/kernel/fs/xfs/xfs_extfree_item.h b/kernel/fs/xfs/xfs_extfree_item.h new file mode 100644 index 000000000..0ffbce32d --- /dev/null +++ b/kernel/fs/xfs/xfs_extfree_item.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTFREE_ITEM_H__ +#define __XFS_EXTFREE_ITEM_H__ + +/* kernel only EFI/EFD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_EFI_MAX_FAST_EXTENTS 16 + +/* + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_EFI_RECOVERED 1 + +/* + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. + */ +typedef struct xfs_efi_log_item { + xfs_log_item_t efi_item; + atomic_t efi_refcount; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ + xfs_efi_log_format_t efi_format; +} xfs_efi_log_item_t; + +/* + * This is the "extent free done" log item. It is used to log + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +typedef struct xfs_efd_log_item { + xfs_log_item_t efd_item; + xfs_efi_log_item_t *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +} xfs_efd_log_item_t; + +/* + * Max number of extents in fast allocation path. 
+ */ +#define XFS_EFD_MAX_FAST_EXTENTS 16 + +extern struct kmem_zone *xfs_efi_zone; +extern struct kmem_zone *xfs_efd_zone; + +xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); +xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, + uint); +int xfs_efi_copy_format(xfs_log_iovec_t *buf, + xfs_efi_log_format_t *dst_efi_fmt); +void xfs_efi_item_free(xfs_efi_log_item_t *); + +#endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_file.c b/kernel/fs/xfs/xfs_file.c new file mode 100644 index 000000000..3b7591224 --- /dev/null +++ b/kernel/fs/xfs/xfs_file.c @@ -0,0 +1,1534 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" + +#include +#include +#include + +static const struct vm_operations_struct xfs_file_vm_ops; + +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. 
+ */
+int
+xfs_iozero(
+	struct xfs_inode	*ip,	/* inode */
+	loff_t			pos,	/* offset in file */
+	size_t			count)	/* size of data to zero */
+{
+	struct page		*page;
+	struct address_space	*mapping;
+	int			status;
+
+	mapping = VFS_I(ip)->i_mapping;
+	/*
+	 * Walk the range one page cache page at a time.  Returns 0 once the
+	 * whole range has been processed, or the pagecache_write_begin()
+	 * status that stopped the walk.
+	 *
+	 * NOTE(review): this is a do/while, so a call with count == 0 still
+	 * runs one iteration with bytes == 0.
+	 */
+	do {
+		unsigned offset, bytes;
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		/* Clamp this step to the end of the page or of the range. */
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = pagecache_write_begin(NULL, mapping, pos, bytes,
+					AOP_FLAG_UNINTERRUPTIBLE,
+					&page, &fsdata);
+		if (status)
+			break;
+
+		zero_user(page, offset, bytes);
+
+		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+					page, fsdata);
+		WARN_ON(status <= 0); /* can't return less than zero! */
+		pos += bytes;
+		count -= bytes;
+		/* The write_end result is only checked by the WARN_ON above. */
+		status = 0;
+	} while (count);
+
+	return status;
+}
+
+/*
+ * Update the preallocation flag and related inode state in one transaction.
+ *
+ * Unless XFS_PREALLOC_INVISIBLE is set, the setuid bit is cleared, the setgid
+ * bit is cleared when group-execute is set, and the modification and change
+ * timestamps are updated.  XFS_PREALLOC_SET / XFS_PREALLOC_CLEAR set or clear
+ * XFS_DIFLAG_PREALLOC, and XFS_PREALLOC_SYNC makes the transaction
+ * synchronous.
+ *
+ * Returns the xfs_trans_reserve() error (after cancelling the transaction) or
+ * the result of xfs_trans_commit().
+ */
+int
+xfs_update_prealloc_flags(
+	struct xfs_inode	*ip,
+	enum xfs_prealloc_flags	flags)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	/* Take the ilock only after the reservation has succeeded. */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+		ip->i_d.di_mode &= ~S_ISUID;
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	if (flags & XFS_PREALLOC_SET)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	if (flags & XFS_PREALLOC_CLEAR)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (flags & XFS_PREALLOC_SYNC)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * Fsync operations on directories are much simpler than on regular files,
+ * as there is no file data to flush, and thus also no need for explicit
+ * cache flush operations, and there are no non-transaction metadata updates
+ * on directories
either.
+ */
+STATIC int
+xfs_dir_fsync(
+	struct file		*file,
+	loff_t			start,
+	loff_t			end,
+	int			datasync)
+{
+	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_lsn_t		lsn = 0;
+
+	trace_xfs_dir_fsync(ip);
+
+	/* Sample the last LSN only while the inode is still pinned. */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip))
+		lsn = ip->i_itemp->ili_last_lsn;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	/* No LSN was recorded: there is nothing to force. */
+	if (!lsn)
+		return 0;
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+}
+
+/*
+ * fsync/fdatasync for regular files: write back and wait on the page cache
+ * range, flush device write caches where barriers are enabled, and force the
+ * log up to the last LSN that touched the inode.
+ *
+ * Returns 0, the writeback error, -EIO after a forced shutdown, or the log
+ * force error.
+ */
+STATIC int
+xfs_file_fsync(
+	struct file		*file,
+	loff_t			start,
+	loff_t			end,
+	int			datasync)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error = 0;
+	int			log_flushed = 0;
+	xfs_lsn_t		lsn = 0;
+
+	trace_xfs_file_fsync(ip);
+
+	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (error)
+		return error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+	if (mp->m_flags & XFS_MOUNT_BARRIER) {
+		/*
+		 * If we have an RT and/or log subvolume we need to make sure
+		 * to flush the write cache of the device used for file data
+		 * first.  This is to ensure newly written file data make
+		 * it to disk before logging the new inode size in case of
+		 * an extending write.
+		 */
+		if (XFS_IS_REALTIME_INODE(ip))
+			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+		else if (mp->m_logdev_targp != mp->m_ddev_targp)
+			xfs_blkdev_issue_flush(mp->m_ddev_targp);
+	}
+
+	/*
+	 * All metadata updates are logged, which means that we just have
+	 * to flush the log up to the latest LSN that touched the inode.
+	 * For datasync, a pinned inode whose only logged fields are
+	 * timestamps does not need a log force.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if (xfs_ipincount(ip)) {
+		if (!datasync ||
+		    (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+			lsn = ip->i_itemp->ili_last_lsn;
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (lsn)
+		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+
+	/*
+	 * If we only have a single device, and the log force above was
+	 * a no-op we might have to flush the data device cache here.
+	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
+	 * an already allocated file and thus do not have any metadata to
+	 * commit.
+	 */
+	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+	    mp->m_logdev_targp == mp->m_ddev_targp &&
+	    !XFS_IS_REALTIME_INODE(ip) &&
+	    !log_flushed)
+		xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+	return error;
+}
+
+/*
+ * ->read_iter for regular files.
+ *
+ * Validates direct IO alignment, clamps the request to s_maxbytes, takes the
+ * iolock shared (briefly exclusive when a direct read finds cached pages that
+ * must be flushed and invalidated) and hands off to generic_file_read_iter().
+ *
+ * Returns the byte count from generic_file_read_iter(), 0 for an empty or
+ * out-of-range request, or a negative errno.
+ */
+STATIC ssize_t
+xfs_file_read_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*to)
+{
+	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	size_t			size = iov_iter_count(to);
+	ssize_t			ret = 0;
+	int			ioflags = 0;
+	xfs_fsize_t		n;
+	loff_t			pos = iocb->ki_pos;
+
+	XFS_STATS_INC(xs_read_calls);
+
+	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+		ioflags |= XFS_IO_ISDIRECT;
+	if (file->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+		xfs_buftarg_t	*target =
+			XFS_IS_REALTIME_INODE(ip) ?
+				mp->m_rtdev_targp : mp->m_ddev_targp;
+		/* DIO must be aligned to device logical sector size */
+		if ((pos | size) & target->bt_logical_sectormask) {
+			/* A misaligned read exactly at EOF reads nothing. */
+			if (pos == i_size_read(inode))
+				return 0;
+			return -EINVAL;
+		}
+	}
+
+	/* n is the room left between pos and the maximum file offset. */
+	n = mp->m_super->s_maxbytes - pos;
+	if (n <= 0 || size == 0)
+		return 0;
+
+	if (n < size)
+		size = n;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/*
+	 * Locking is a bit tricky here. If we take an exclusive lock
+	 * for direct IO, we effectively serialise all new concurrent
+	 * read IO to this file and block it behind IO that is currently in
+	 * progress because IO in progress holds the IO lock shared. We only
+	 * need to hold the lock exclusive to blow away the page cache, so
+	 * only take lock exclusively if the page cache needs invalidation.
+	 * This allows the normal direct IO case of no page cache pages to
+	 * proceed concurrently without serialisation.
+	 */
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
+		/* Recheck: the lock was dropped while upgrading. */
+		if (inode->i_mapping->nrpages) {
+			ret = filemap_write_and_wait_range(
+							VFS_I(ip)->i_mapping,
+							pos, pos + size - 1);
+			if (ret) {
+				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				return ret;
+			}
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
+		}
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+	}
+
+	trace_xfs_file_read(ip, size, pos, ioflags);
+
+	ret = generic_file_read_iter(iocb, to);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
+}
+
+/*
+ * ->splice_read: run generic_file_splice_read() under the shared iolock and
+ * account the bytes read.  Returns -EIO after a forced shutdown.
+ */
+STATIC ssize_t
+xfs_file_splice_read(
+	struct file		*infilp,
+	loff_t			*ppos,
+	struct pipe_inode_info	*pipe,
+	size_t			count,
+	unsigned int		flags)
+{
+	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
+	int			ioflags = 0;
+	ssize_t			ret;
+
+	XFS_STATS_INC(xs_read_calls);
+
+	/* ioflags is only passed to the tracepoint below. */
+	if (infilp->f_mode & FMODE_NOCMTIME)
+		ioflags |= XFS_IO_INVIS;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+	if (ret > 0)
+		XFS_STATS_ADD(xs_read_bytes, ret);
+
+	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last block of the
+ * file that is beyond the EOF. We do this since the size is being increased
+ * without writing anything to that block and we don't want to read the
+ * garbage on the disk.
+ */
+STATIC int				/* 0 or negative errno */
+xfs_zero_last_block(
+	struct xfs_inode	*ip,
+	xfs_fsize_t		offset,
+	xfs_fsize_t		isize,
+	bool			*did_zeroing)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
+	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+	int			zero_len;
+	int			nimaps = 1;
+	int			error = 0;
+	struct xfs_bmbt_irec	imap;
+
+	/* Map the single block that contains the current EOF. */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		return error;
+
+	ASSERT(nimaps > 0);
+
+	/*
+	 * If the block underlying isize is just a hole, then there
+	 * is nothing to zero.
+	 */
+	if (imap.br_startblock == HOLESTARTBLOCK)
+		return 0;
+
+	/* Zero from isize to the end of its block, but never past @offset. */
+	zero_len = mp->m_sb.sb_blocksize - zero_offset;
+	if (isize + zero_len > offset)
+		zero_len = offset - isize;
+	/* Set before the zeroing is attempted, so it stays set on failure. */
+	*did_zeroing = true;
+	return xfs_iozero(ip, isize, zero_len);
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new, larger EOF.
+ *
+ * This handles the normal case of zeroing the remainder of the last block in
+ * the file and the unusual case of zeroing blocks out beyond the size of the
+ * file.  This second case only happens with fixed size extents and when the
+ * system crashes before the inode size was updated but after blocks were
+ * allocated.
+ *
+ * Expects the iolock to be held exclusive, and will take the ilock internally.
+ */
+int					/* 0 or negative errno */
+xfs_zero_eof(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,		/* starting I/O offset */
+	xfs_fsize_t		isize,		/* current inode size */
+	bool			*did_zeroing)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		start_zero_fsb;
+	xfs_fileoff_t		end_zero_fsb;
+	xfs_fileoff_t		zero_count_fsb;
+	xfs_fileoff_t		last_fsb;
+	xfs_fileoff_t		zero_off;
+	xfs_fsize_t		zero_len;
+	int			nimaps;
+	int			error = 0;
+	struct xfs_bmbt_irec	imap;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(offset > isize);
+
+	/*
+	 * First handle zeroing the block on which isize resides.
+	 *
+	 * We only zero a part of that block so it is handled specially.
+	 */
+	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
+		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * Calculate the range between the new size and the old where blocks
+	 * needing to be zeroed may exist.
+	 *
+	 * To get the block where the last byte in the file currently resides,
+	 * we need to subtract one from the size and truncate back to a block
+	 * boundary.  We subtract 1 in case the size is exactly on a block
+	 * boundary.
+	 */
+	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+	if (last_fsb == end_zero_fsb) {
+		/*
+		 * The size was only incremented on its last block.
+		 * We took care of that above, so just return.
+		 */
+		return 0;
+	}
+
+	ASSERT(start_zero_fsb <= end_zero_fsb);
+	while (start_zero_fsb <= end_zero_fsb) {
+		nimaps = 1;
+		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+
+		/* The ilock is held only for the mapping lookup. */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
+					  &imap, &nimaps, 0);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error)
+			return error;
+
+		ASSERT(nimaps > 0);
+
+		/* Unwritten extents and holes are skipped without zeroing. */
+		if (imap.br_state == XFS_EXT_UNWRITTEN ||
+		    imap.br_startblock == HOLESTARTBLOCK) {
+			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+			continue;
+		}
+
+		/*
+		 * There are blocks we need to zero.
+		 */
+		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+		/* Never zero at or beyond the start of the new IO. */
+		if ((zero_off + zero_len) > offset)
+			zero_len = offset - zero_off;
+
+		error = xfs_iozero(ip, zero_off, zero_len);
+		if (error)
+			return error;
+
+		*did_zeroing = true;
+		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+	}
+
+	return 0;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Called with the iolock held either shared or exclusive according to
+ * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+	struct kiocb		*iocb,
+	struct iov_iter		*from,
+	int			*iolock)
+{
+	struct file		*file = iocb->ki_filp;
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			error = 0;
+	/* Original length, used to re-expand @from before a restart. */
+	size_t			count = iov_iter_count(from);
+
+restart:
+	error = generic_write_checks(iocb, from);
+	if (error <= 0)
+		return error;
+
+	error = xfs_break_layouts(inode, iolock, true);
+	if (error)
+		return error;
+
+	/*
+	 * If the offset is beyond the size of the file, we need to zero any
+	 * blocks that fall between the existing EOF and the start of this
+	 * write.  If zeroing is needed and we are currently holding the
+	 * iolock shared, we need to update it to exclusive which implies
+	 * having to redo all checks before.
+	 *
+	 * We need to serialise against EOF updates that occur in IO
+	 * completions here. We want to make sure that nobody is changing the
+	 * size while we do this check until we have placed an IO barrier (i.e.
+	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
+	 * The spinlock effectively forms a memory barrier once we have the
+	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
+	 * and hence be able to correctly determine if we need to run zeroing.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (iocb->ki_pos > i_size_read(inode)) {
+		/* Receives xfs_zero_eof()'s did_zeroing result; not read here. */
+		bool	zero = false;
+
+		spin_unlock(&ip->i_flags_lock);
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_rw_iunlock(ip, *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_rw_ilock(ip, *iolock);
+			iov_iter_reexpand(from, count);
+
+			/*
+			 * We now have an IO submission barrier in place, but
+			 * AIO can do EOF updates during IO completion and hence
+			 * we now need to wait for all of them to drain. Non-AIO
+			 * DIO will have drained before we are given the
+			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
+			 * no-op.
+			 */
+			inode_dio_wait(inode);
+			goto restart;
+		}
+		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
+		if (error)
+			return error;
+	} else
+		spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Updating the timestamps will grab the ilock again from
+	 * xfs_fs_dirty_inode, so we have to call it after dropping the
+	 * lock above.  Eventually we should look into a way to avoid
+	 * the pointless lock roundtrip.
+	 */
+	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+		error = file_update_time(file);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If we're writing the file then make sure to clear the setuid and
+	 * setgid bits if the process is not being run by root.  This keeps
+	 * people from modifying setuid and setgid binaries.
+	 */
+	return file_remove_suid(file);
+}
+
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. inode_dio_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	ssize_t			ret = 0;
+	int			unaligned_io = 0;
+	int			iolock;
+	size_t			count = iov_iter_count(from);
+	loff_t			pos = iocb->ki_pos;
+	loff_t			end;
+	struct iov_iter		data;
+	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
+					mp->m_rtdev_targp : mp->m_ddev_targp;
+
+	/* DIO must be aligned to device logical sector size */
+	if ((pos | count) & target->bt_logical_sectormask)
+		return -EINVAL;
+
+	/* "unaligned" here means not aligned to a filesystem block */
+	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+		unaligned_io = 1;
+
+	/*
+	 * We don't need to take an exclusive lock unless the page cache needs
+	 * to be invalidated or unaligned IO is being executed. We don't need to
+	 * consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (unaligned_io || mapping->nrpages)
+		iolock = XFS_IOLOCK_EXCL;
+	else
+		iolock = XFS_IOLOCK_SHARED;
+	xfs_rw_ilock(ip, iolock);
+
+	/*
+	 * Recheck if there are cached pages that need invalidate after we got
+	 * the iolock to protect against other threads adding new pages while
+	 * we were waiting for the iolock.
+	 */
+	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+		xfs_rw_iunlock(ip, iolock);
+		iolock = XFS_IOLOCK_EXCL;
+		xfs_rw_ilock(ip, iolock);
+	}
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+	/* The checks may have changed the iterator; re-read the range. */
+	count = iov_iter_count(from);
+	pos = iocb->ki_pos;
+	end = pos + count - 1;
+
+	if (mapping->nrpages) {
+		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						   pos, end);
+		if (ret)
+			goto out;
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					end >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
+	}
+
+	/*
+	 * If we are doing unaligned IO, wait for all other IO to drain,
+	 * otherwise demote the lock if we had to flush cached pages
+	 */
+	if (unaligned_io)
+		inode_dio_wait(inode);
+	else if (iolock == XFS_IOLOCK_EXCL) {
+		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		iolock = XFS_IOLOCK_SHARED;
+	}
+
+	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+
+	/* Issue the IO on a copy so @from is only advanced by what completed. */
+	data = *from;
+	ret = mapping->a_ops->direct_IO(iocb, &data, pos);
+
+	/* see generic_file_direct_write() for why this is necessary */
+	if (mapping->nrpages) {
+		invalidate_inode_pages2_range(mapping,
+					      pos >> PAGE_CACHE_SHIFT,
+					      end >> PAGE_CACHE_SHIFT);
+	}
+
+	if (ret > 0) {
+		pos += ret;
+		iov_iter_advance(from, ret);
+		iocb->ki_pos = pos;
+	}
+out:
+	xfs_rw_iunlock(ip, iolock);
+
+	/* No fallback to buffered IO on errors for XFS. */
+	ASSERT(ret < 0 || ret == count);
+	return ret;
+}
+
+/*
+ * Buffered write path.  Takes the iolock exclusive, runs the common write
+ * checks and calls generic_perform_write().  A failure with -EDQUOT or
+ * -ENOSPC is retried at most once after trying to free preallocated space.
+ *
+ * Returns the number of bytes written or a negative errno.
+ */
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			ret;
+	/* Non-zero once a retry has been made; blocks a second one. */
+	int			enospc = 0;
+	int			iolock = XFS_IOLOCK_EXCL;
+
+	xfs_rw_ilock(ip, iolock);
+
+	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+	if (ret)
+		goto out;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+write_retry:
+	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
+				      iocb->ki_pos, 0);
+	ret = generic_perform_write(file, from, iocb->ki_pos);
+	if (likely(ret >= 0))
+		iocb->ki_pos += ret;
+
+	/*
+	 * If we hit a space limit, try to free up some lingering preallocated
+	 * space before returning an error. In the case of ENOSPC, first try to
+	 * write back all dirty inodes to free up some of the excess reserved
+	 * metadata space. This reduces the chances that the eofblocks scan
+	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+	 * also behaves as a filter to prevent too many eofblocks scans from
+	 * running at the same time.
+	 */
+	if (ret == -EDQUOT && !enospc) {
+		enospc = xfs_inode_free_quota_eofblocks(ip);
+		if (enospc)
+			goto write_retry;
+	} else if (ret == -ENOSPC && !enospc) {
+		struct xfs_eofblocks eofb = {0};
+
+		enospc = 1;
+		xfs_flush_inodes(ip->i_mount);
+		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+		goto write_retry;
+	}
+
+	current->backing_dev_info = NULL;
+out:
+	xfs_rw_iunlock(ip, iolock);
+	return ret;
+}
+
+/*
+ * ->write_iter: dispatch to the direct or buffered write path, then apply
+ * sync-write semantics through generic_write_sync().
+ *
+ * Returns the number of bytes written, 0 for an empty write, -EIO after a
+ * forced shutdown, or a negative errno from the write or the sync.
+ */
+STATIC ssize_t
+xfs_file_write_iter(
+	struct kiocb		*iocb,
+	struct iov_iter		*from)
+{
+	struct file		*file = iocb->ki_filp;
+	struct address_space	*mapping = file->f_mapping;
+	struct inode		*inode = mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	ssize_t			ret;
+	size_t			ocount = iov_iter_count(from);
+
+	XFS_STATS_INC(xs_write_calls);
+
+	if (ocount == 0)
+		return 0;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return -EIO;
+
+	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+		ret = xfs_file_dio_aio_write(iocb, from);
+	else
+		ret = xfs_file_buffered_aio_write(iocb, from);
+
+	if (ret > 0) {
+		ssize_t err;
+
+		XFS_STATS_ADD(xs_write_bytes, ret);
+
+		/* Handle various SYNC-type writes */
+		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
+}
+
+/* fallocate() mode bits accepted by xfs_file_fallocate(). */
+#define	XFS_FALLOC_FL_SUPPORTED						\
+		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
+		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
+		 FALLOC_FL_INSERT_RANGE)
+
+/*
+ * ->fallocate for regular files.
+ *
+ * Handles hole punching, range collapse, range insertion, range zeroing and
+ * plain preallocation, selected by @mode.  The iolock and the mmap lock are
+ * both held exclusive across the operation.
+ *
+ * Returns 0 or a negative errno: -EINVAL for a non-regular file or a
+ * misaligned/out-of-range collapse or insert, -EOPNOTSUPP for unsupported
+ * mode bits, -EFBIG when an insert would grow the file past s_maxbytes, or
+ * whatever the underlying space-manipulation helper reports.
+ */
+STATIC long
+xfs_file_fallocate(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len)
+{
+	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	long			error;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
+	/* Non-zero when the file size must be changed afterwards. */
+	loff_t			new_size = 0;
+	bool			do_file_insert = 0;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+		return -EOPNOTSUPP;
+
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock, false);
+	if (error)
+		goto out_unlock;
+
+	/* Record the mmap lock in iolock so out_unlock drops both. */
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		error = xfs_free_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/*
+		 * There is no need to overlap collapse range with EOF,
+		 * in which case it is effectively a truncate operation
+		 */
+		if (offset + len >= i_size_read(inode)) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		new_size = i_size_read(inode) - len;
+
+		error = xfs_collapse_file_space(ip, offset, len);
+		if (error)
+			goto out_unlock;
+	} else if (mode & FALLOC_FL_INSERT_RANGE) {
+		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+		loff_t isize = i_size_read(inode);
+
+		if (offset & blksize_mask || len & blksize_mask) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+
+		/*
+		 * The new inode size must not exceed s_maxbytes.  Test it by
+		 * subtraction before adding: isize + len can overflow the
+		 * signed loff_t, and a wrapped value would pass a plain
+		 * "new_size > s_maxbytes" comparison.
+		 */
+		if (inode->i_sb->s_maxbytes - isize < len) {
+			error = -EFBIG;
+			goto out_unlock;
+		}
+		new_size = isize + len;
+
+		/* Offset should be less than i_size */
+		if (offset >= isize) {
+			error = -EINVAL;
+			goto out_unlock;
+		}
+		do_file_insert = 1;
+	} else {
+		flags |= XFS_PREALLOC_SET;
+
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    offset + len > i_size_read(inode)) {
+			new_size = offset + len;
+			error = inode_newsize_ok(inode, new_size);
+			if (error)
+				goto out_unlock;
+		}
+
+		if (mode & FALLOC_FL_ZERO_RANGE)
+			error = xfs_zero_file_space(ip, offset, len);
+		else
+			error = xfs_alloc_file_space(ip, offset, len,
+						     XFS_BMAPI_PREALLOC);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (file->f_flags & O_DSYNC)
+		flags |= XFS_PREALLOC_SYNC;
+
+	error = xfs_update_prealloc_flags(ip, flags);
+	if (error)
+		goto out_unlock;
+
+	/* Change file size if needed */
+	if (new_size) {
+		struct iattr iattr;
+
+		iattr.ia_valid = ATTR_SIZE;
+		iattr.ia_size = new_size;
+		error = xfs_setattr_size(ip, &iattr);
+		if (error)
+			goto out_unlock;
+	}
+
+	/*
+	 * Perform hole insertion now that the file size has been
+	 * updated so that if we crash during the operation we don't
+	 * leave shifted extents past EOF and hence losing access to
+	 * the data that is contained within them.
+	 */
+	if (do_file_insert)
+		error = xfs_insert_file_space(ip, offset, len);
+
+out_unlock:
+	xfs_iunlock(ip, iolock);
+	return error;
+}
+
+
+/*
+ * ->open for regular files: refuse files larger than MAX_NON_LFS unless
+ * O_LARGEFILE was given (-EFBIG), and refuse everything after a forced
+ * shutdown (-EIO).
+ */
+STATIC int
+xfs_file_open(
+	struct inode	*inode,
+	struct file	*file)
+{
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EFBIG;
+	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+		return -EIO;
+	return 0;
+}
+
+/*
+ * ->open for directories: apply the xfs_file_open() checks, then start a
+ * readahead of the first directory block when the directory has extents.
+ */
+STATIC int
+xfs_dir_open(
+	struct inode	*inode,
+	struct file	*file)
+{
+	struct xfs_inode *ip = XFS_I(inode);
+	int		mode;
+	int		error;
+
+	error = xfs_file_open(inode, file);
+	if (error)
+		return error;
+
+	/*
+	 * If there are any blocks, read-ahead block 0 as we're almost
+	 * certain to have the next operation be a read there.
+	 */
+	mode = xfs_ilock_data_map_shared(ip);
+	if (ip->i_d.di_nextents > 0)
+		xfs_dir3_data_readahead(ip, 0, -1);
+	xfs_iunlock(ip, mode);
+	return 0;
+}
+
+/* ->release: forwards to xfs_release() for the XFS inode. */
+STATIC int
+xfs_file_release(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	return xfs_release(XFS_I(inode));
+}
+
+/* ->iterate for directories: call xfs_readdir() with a buffer size estimate. */
+STATIC int
+xfs_file_readdir(
+	struct file	*file,
+	struct dir_context *ctx)
+{
+	struct inode	*inode = file_inode(file);
+	xfs_inode_t	*ip = XFS_I(inode);
+	size_t		bufsize;
+
+	/*
+	 * The Linux API doesn't pass down the total size of the buffer
+	 * we read into down to the filesystem.  With the filldir concept
+	 * it's not needed for correct information, but the XFS dir2 leaf
+	 * code wants an estimate of the buffer size to calculate its
+	 * readahead window and size the buffers used for mapping to
+	 * physical blocks.
+	 *
+	 * Try to give it an estimate that's good enough, maybe at some
+	 * point we can change the ->readdir prototype to include the
+	 * buffer size.  For now we use the current glibc buffer size.
+	 */
+	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+
+	return xfs_readdir(ip, ctx, bufsize);
+}
+
+/* ->mmap: install the XFS vm operations and update the access time. */
+STATIC int
+xfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	vma->vm_ops = &xfs_file_vm_ops;
+
+	file_accessed(filp);
+	return 0;
+}
+
+/*
+ * This type is designed to indicate the type of offset we would like
+ * to search from page cache for xfs_seek_hole_data().
+ */
+enum {
+	HOLE_OFF = 0,
+	DATA_OFF,
+};
+
+/*
+ * Lookup the desired type of offset from the given page.
+ *
+ * On success, return true and the offset argument will point to the
+ * start of the region that was found.  Otherwise this function will
+ * return false and keep the offset argument unchanged.
+ */
+STATIC bool
+xfs_lookup_buffer_offset(
+	struct page		*page,
+	loff_t			*offset,
+	unsigned int		type)
+{
+	/* File offset of the buffer currently being examined. */
+	loff_t			lastoff = page_offset(page);
+	bool			found = false;
+	struct buffer_head	*bh, *head;
+
+	bh = head = page_buffers(page);
+	do {
+		/*
+		 * Unwritten extents that have data in the page
+		 * cache covering them can be identified by the
+		 * BH_Unwritten state flag.  Pages with multiple
+		 * buffers might have a mix of holes, data and
+		 * unwritten extents - any buffer with valid
+		 * data in it should have BH_Uptodate flag set
+		 * on it.
+		 */
+		if (buffer_unwritten(bh) ||
+		    buffer_uptodate(bh)) {
+			if (type == DATA_OFF)
+				found = true;
+		} else {
+			if (type == HOLE_OFF)
+				found = true;
+		}
+
+		if (found) {
+			*offset = lastoff;
+			break;
+		}
+		lastoff += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+
+	return found;
+}
+
+/*
+ * This routine is called to find out and return a data or hole offset
+ * from the page cache for unwritten extents according to the desired
+ * type for xfs_seek_hole_data().
+ *
+ * The argument offset is used to tell where we start to search from the
+ * page cache.  Map is used to figure out the end points of the range to
+ * lookup pages.
+ *
+ * Return true if the desired type of offset was found, and the argument
+ * offset is filled with that address.  Otherwise, return false and keep
+ * offset unchanged.
+ */
+STATIC bool
+xfs_find_get_desired_pgoff(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*map,
+	unsigned int		type,
+	loff_t			*offset)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct pagevec		pvec;
+	pgoff_t			index;
+	pgoff_t			end;
+	loff_t			endoff;
+	loff_t			startoff = *offset;
+	/* End of the last page examined so far (starts at startoff). */
+	loff_t			lastoff = startoff;
+	bool			found = false;
+
+	pagevec_init(&pvec, 0);
+
+	index = startoff >> PAGE_CACHE_SHIFT;
+	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
+	end = endoff >> PAGE_CACHE_SHIFT;
+	do {
+		int		want;
+		unsigned	nr_pages;
+		unsigned int	i;
+
+		/*
+		 * NOTE(review): the loop runs while index <= end, yet want is
+		 * end - index, which is 0 when index == end.  Confirm that a
+		 * zero-page lookup is the intended behaviour in that case.
+		 */
+		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+					  want);
+		/*
+		 * No page mapped into given range.  If we are searching holes
+		 * and if this is the first time we got into the loop, it means
+		 * that the given offset is landed in a hole, return it.
+		 *
+		 * If we have already stepped through some block buffers to find
+		 * holes but they all contain data.  In this case, the last
+		 * offset is already updated and pointed to the end of the last
+		 * mapped page, if it does not reach the endpoint to search,
+		 * that means there should be a hole between them.
+		 */
+		if (nr_pages == 0) {
+			/* Data search found nothing */
+			if (type == DATA_OFF)
+				break;
+
+			ASSERT(type == HOLE_OFF);
+			if (lastoff == startoff || lastoff < endoff) {
+				found = true;
+				*offset = lastoff;
+			}
+			break;
+		}
+
+		/*
+		 * At least we found one page.  If this is the first time we
+		 * step into the loop, and if the first page index offset is
+		 * greater than the given search offset, a hole was found.
+		 * *offset is left untouched here: it still holds startoff.
+		 */
+		if (type == HOLE_OFF && lastoff == startoff &&
+		    lastoff < page_offset(pvec.pages[0])) {
+			found = true;
+			break;
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page	*page = pvec.pages[i];
+			loff_t		b_offset;
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL),
+			 * or even swizzled back from swapper_space to tmpfs
+			 * file mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 *
+			 * Searching done if the page index is out of range.
+			 * If the current offset has not reached the end of
+			 * the specified search range, there should be a hole
+			 * between them.
+			 */
+			if (page->index > end) {
+				if (type == HOLE_OFF && lastoff < endoff) {
+					*offset = lastoff;
+					found = true;
+				}
+				goto out;
+			}
+
+			lock_page(page);
+			/*
+			 * Page truncated or invalidated(page->mapping == NULL).
+			 * We can freely skip it and proceed to check the next
+			 * page.
+			 */
+			if (unlikely(page->mapping != inode->i_mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!page_has_buffers(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			found = xfs_lookup_buffer_offset(page, &b_offset, type);
+			if (found) {
+				/*
+				 * The found offset may be less than the start
+				 * point to search if this is the first time to
+				 * come here.
+				 */
+				*offset = max_t(loff_t, startoff, b_offset);
+				unlock_page(page);
+				goto out;
+			}
+
+			/*
+			 * We were either searching for data but nothing was
+			 * found, or searching for a hole but found a data
+			 * buffer.  In either case, probably the next page
+			 * contains the desired things, update the last offset
+			 * to it so.
+			 */
+			lastoff = page_offset(page) + PAGE_SIZE;
+			unlock_page(page);
+		}
+
+		/*
+		 * The number of returned pages is less than desired, search
+		 * done.  In this case, nothing was found for searching data,
+		 * but we found a hole behind the last offset.
+		 */
+		if (nr_pages < want) {
+			if (type == HOLE_OFF) {
+				*offset = lastoff;
+				found = true;
+			}
+			break;
+		}
+
+		/* Continue after the last page returned by this lookup. */
+		index = pvec.pages[i - 1]->index + 1;
+		pagevec_release(&pvec);
+	} while (index <= end);
+
+out:
+	pagevec_release(&pvec);
+	return found;
+}
+
+/*
+ * Implement SEEK_HOLE and SEEK_DATA.
+ *
+ * Walks the extent map from @start two mappings at a time.  Holes and normal
+ * or delayed-allocation extents are decided from the mapping alone; unwritten
+ * extents are probed in the page cache via xfs_find_get_desired_pgoff().
+ *
+ * Returns the result of vfs_setpos() for the offset found, -ENXIO when @start
+ * is at or beyond EOF or no data is found, -EIO after a forced shutdown, or
+ * the xfs_bmapi_read() error.
+ */
+STATIC loff_t
+xfs_seek_hole_data(
+	struct file		*file,
+	loff_t			start,
+	int			whence)
+{
+	struct inode		*inode = file->f_mapping->host;
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	loff_t			uninitialized_var(offset);
+	xfs_fsize_t		isize;
+	xfs_fileoff_t		fsbno;
+	xfs_filblks_t		end;
+	uint			lock;
+	int			error;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	lock = xfs_ilock_data_map_shared(ip);
+
+	isize = i_size_read(inode);
+	if (start >= isize) {
+		error = -ENXIO;
+		goto out_unlock;
+	}
+
+	/*
+	 * Try to read extents from the first block indicated
+	 * by fsbno to the end block of the file.
+	 */
+	fsbno = XFS_B_TO_FSBT(mp, start);
+	end = XFS_B_TO_FSB(mp, isize);
+
+	for (;;) {
+		struct xfs_bmbt_irec	map[2];
+		int			nmap = 2;
+		unsigned int		i;
+
+		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
+				       XFS_BMAPI_ENTIRE);
+		if (error)
+			goto out_unlock;
+
+		/* No extents at given offset, must be beyond EOF */
+		if (nmap == 0) {
+			error = -ENXIO;
+			goto out_unlock;
+		}
+
+		for (i = 0; i < nmap; i++) {
+			/* Candidate result: start of this mapping, not before start. */
+			offset = max_t(loff_t, start,
+				       XFS_FSB_TO_B(mp, map[i].br_startoff));
+
+			/* Landed in the hole we wanted? */
+			if (whence == SEEK_HOLE &&
+			    map[i].br_startblock == HOLESTARTBLOCK)
+				goto out;
+
+			/* Landed in the data extent we wanted? */
+			if (whence == SEEK_DATA &&
+			    (map[i].br_startblock == DELAYSTARTBLOCK ||
+			     (map[i].br_state == XFS_EXT_NORM &&
+			      !isnullstartblock(map[i].br_startblock))))
+				goto out;
+
+			/*
+			 * Landed in an unwritten extent, try to search
+			 * for hole or data from page cache.
+			 */
+			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
+				if (xfs_find_get_desired_pgoff(inode, &map[i],
+				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
+							&offset))
+					goto out;
+			}
+		}
+
+		/*
+		 * We only received one extent out of the two requested. This
+		 * means we've hit EOF and didn't find what we are looking for.
+		 */
+		if (nmap == 1) {
+			/*
+			 * If we were looking for a hole, set offset to
+			 * the end of the file (i.e., there is an implicit
+			 * hole at the end of any file).
+		 	 */
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			/*
+			 * If we were looking for data, it's nowhere to be found
+			 */
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
+			goto out_unlock;
+		}
+
+		ASSERT(i > 1);
+
+		/*
+		 * Nothing was found, proceed to the next round of search
+		 * if the next reading offset is not at or beyond EOF.
+		 */
+		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
+		start = XFS_FSB_TO_B(mp, fsbno);
+		if (start >= isize) {
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
+			goto out_unlock;
+		}
+	}
+
+out:
+	/*
+	 * If at this point we have found the hole we wanted, the returned
+	 * offset may be bigger than the file size as it may be aligned to
+	 * page boundary for unwritten extents.  We need to deal with this
+	 * situation in particular.
+	 */
+	if (whence == SEEK_HOLE)
+		offset = min_t(loff_t, offset, isize);
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+
+out_unlock:
+	xfs_iunlock(ip, lock);
+
+	if (error)
+		return error;
+	return offset;
+}
+
+/*
+ * ->llseek: SEEK_SET/SEEK_CUR/SEEK_END go to generic_file_llseek(),
+ * SEEK_HOLE/SEEK_DATA go to xfs_seek_hole_data(); any other whence value
+ * yields -EINVAL.
+ */
+STATIC loff_t
+xfs_file_llseek(
+	struct file	*file,
+	loff_t		offset,
+	int		whence)
+{
+	switch (whence) {
+	case SEEK_END:
+	case SEEK_CUR:
+	case SEEK_SET:
+		return generic_file_llseek(file, offset, whence);
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		return xfs_seek_hole_data(file, offset, whence);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Locking for serialisation of IO during page faults.
This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ *
+ * The handler below takes XFS_MMAPLOCK_SHARED around filemap_fault() and
+ * returns whatever filemap_fault() returned.
+ */
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_fault(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = filemap_fault(vma, vmf);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ *
+ * The work is done by block_page_mkwrite() with xfs_get_blocks as the block
+ * mapping callback, under XFS_MMAPLOCK_SHARED; its return value is passed
+ * back unchanged.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_page_mkwrite(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+/* File operations table; llseek goes through xfs_file_llseek(). */
+const struct file_operations xfs_file_operations = {
+	.llseek		= xfs_file_llseek,
+	.read_iter	= xfs_file_read_iter,
+	.write_iter	= xfs_file_write_iter,
+	.splice_read	= xfs_file_splice_read,
+	.splice_write	= iter_file_splice_write,
+	.unlocked_ioctl	= xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= xfs_file_compat_ioctl,
+#endif
+	.mmap		= xfs_file_mmap,
+	.open		= xfs_file_open,
+	.release	= xfs_file_release,
+	.fsync		= xfs_file_fsync,
+	.fallocate	= xfs_file_fallocate,
+};
+
+/* Directory variant: readdir via .iterate, generic llseek, no data I/O ops. */
+const struct file_operations xfs_dir_file_operations = {
+	.open		= xfs_dir_open,
+	.read		= generic_read_dir,
+	.iterate	= xfs_file_readdir,
+	.llseek		= generic_file_llseek,
+	.unlocked_ioctl	= xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= xfs_file_compat_ioctl,
+#endif
+	.fsync		= xfs_dir_fsync,
+};
+
+/* VM operations wiring in the two fault handlers defined above. */
+static const struct vm_operations_struct xfs_file_vm_ops = {
+	.fault		= xfs_filemap_fault,
+	.map_pages	= filemap_map_pages,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
+};
diff --git a/kernel/fs/xfs/xfs_filestream.c b/kernel/fs/xfs/xfs_filestream.c
new file mode 100644
index 000000000..da82f1cb4
--- /dev/null
+++ b/kernel/fs/xfs/xfs_filestream.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_alloc.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_trace.h"
+
+struct xfs_fstrm_item {
+	struct xfs_mru_cache_elem	mru;	/* embedded MRU cache element */
+	struct xfs_inode		*ip;	/* inode stored by xfs_filestream_pick_ag() */
+	xfs_agnumber_t			ag; /* AG in use for this directory */
+};
+
+enum xfs_fstrm_alloc {
+	XFS_PICK_USERDATA = 1,	/* set by xfs_filestream_new_ag() when ap->userdata */
+	XFS_PICK_LOWSPACE = 2,	/* set when ap->flist->xbf_low, or on the last scan pass */
+};
+
+/*
+ * Allocation group filestream associations are tracked with per-ag atomic
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
+ * particular AG already has active filestreams associated with it.
The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. 
+ */
+
+/*
+ * Return the current value of the per-AG filestream counter (pagf_fstrms)
+ * for @agno.
+ */
+int
+xfs_filestream_peek_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_read(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+/*
+ * Increment the per-AG filestream counter for @agno and return the new value.
+ */
+static int
+xfs_filestream_get_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+	int		ret;
+
+	pag = xfs_perag_get(mp, agno);
+	ret = atomic_inc_return(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+	return ret;
+}
+
+/*
+ * Decrement the per-AG filestream counter for @agno.
+ */
+static void
+xfs_filestream_put_ag(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno)
+{
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, agno);
+	atomic_dec(&pag->pagf_fstrms);
+	xfs_perag_put(pag);
+}
+
+/*
+ * Free callback registered with the MRU cache in xfs_filestream_mount():
+ * drop the item's AG counter reference, emit the trace event and free the
+ * item. Also called directly by xfs_filestream_new_ag().
+ */
+static void
+xfs_fstrm_free_func(
+	struct xfs_mru_cache_elem *mru)
+{
+	struct xfs_fstrm_item	*item =
+		container_of(mru, struct xfs_fstrm_item, mru);
+
+	xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+	trace_xfs_filestream_free(item->ip, item->ag);
+
+	kmem_free(item);
+}
+
+/*
+ * Scan the AGs starting at startag looking for an AG that isn't in use and has
+ * at least minlen blocks free.
+ *
+ * On success *agp holds the chosen AG and, unless the final "take AG 0"
+ * fallback was used, an item mapping ip->i_ino to that AG has been inserted
+ * into the MRU cache (-EEXIST from the insert is treated as success). Returns
+ * 0 or a negative errno; on error the AG counter reference is dropped again.
+ */
+static int
+xfs_filestream_pick_ag(
+	struct xfs_inode	*ip,
+	xfs_agnumber_t		startag,
+	xfs_agnumber_t		*agp,
+	int			flags,
+	xfs_extlen_t		minlen)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_fstrm_item	*item;
+	struct xfs_perag	*pag;
+	xfs_extlen_t		longest, free = 0, minfree, maxfree = 0;
+	xfs_agnumber_t		ag, max_ag = NULLAGNUMBER;
+	int			err, trylock, nscan;
+
+	ASSERT(S_ISDIR(ip->i_d.di_mode));
+
+	/* 2% of an AG's blocks must be free for it to be chosen. */
+	minfree = mp->m_sb.sb_agblocks / 50;
+
+	ag = startag;
+	*agp = NULLAGNUMBER;
+
+	/* For the first pass, don't sleep trying to init the per-AG. */
+	trylock = XFS_ALLOC_FLAG_TRYLOCK;
+
+	for (nscan = 0; 1; nscan++) {
+		trace_xfs_filestream_scan(ip, ag);
+
+		pag = xfs_perag_get(mp, ag);
+
+		if (!pag->pagf_init) {
+			err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
+			if (err && !trylock) {
+				xfs_perag_put(pag);
+				return err;
+			}
+		}
+
+		/* Might fail sometimes during the 1st pass with trylock set. */
+		if (!pag->pagf_init)
+			goto next_ag;
+
+		/* Keep track of the AG with the most free blocks. */
+		if (pag->pagf_freeblks > maxfree) {
+			maxfree = pag->pagf_freeblks;
+			max_ag = ag;
+		}
+
+		/*
+		 * The AG reference count does two things: it enforces mutual
+		 * exclusion when examining the suitability of an AG in this
+		 * loop, and it guards against two filestreams being established
+		 * in the same AG as each other.
+		 */
+		if (xfs_filestream_get_ag(mp, ag) > 1) {
+			xfs_filestream_put_ag(mp, ag);
+			goto next_ag;
+		}
+
+		longest = xfs_alloc_longest_free_extent(mp, pag);
+		if (((minlen && longest >= minlen) ||
+		     (!minlen && pag->pagf_freeblks >= minfree)) &&
+		    (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) ||
+		     (flags & XFS_PICK_LOWSPACE))) {
+
+			/* Break out, retaining the reference on the AG. */
+			free = pag->pagf_freeblks;
+			xfs_perag_put(pag);
+			*agp = ag;
+			break;
+		}
+
+		/* Drop the reference on this AG, it's not usable. */
+		xfs_filestream_put_ag(mp, ag);
+next_ag:
+		xfs_perag_put(pag);
+		/* Move to the next AG, wrapping to AG 0 if necessary. */
+		if (++ag >= mp->m_sb.sb_agcount)
+			ag = 0;
+
+		/* If a full pass of the AGs hasn't been done yet, continue. */
+		if (ag != startag)
+			continue;
+
+		/* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */
+		if (trylock != 0) {
+			trylock = 0;
+			continue;
+		}
+
+		/* Finally, if lowspace wasn't set, set it for the 3rd pass. */
+		if (!(flags & XFS_PICK_LOWSPACE)) {
+			flags |= XFS_PICK_LOWSPACE;
+			continue;
+		}
+
+		/*
+		 * Take the AG with the most free space, regardless of whether
+		 * it's already in use by another filestream.
+		 */
+		if (max_ag != NULLAGNUMBER) {
+			xfs_filestream_get_ag(mp, max_ag);
+			free = maxfree;
+			*agp = max_ag;
+			break;
+		}
+
+		/* take AG 0 if none matched */
+		trace_xfs_filestream_pick(ip, *agp, free, nscan);
+		*agp = 0;
+		return 0;
+	}
+
+	trace_xfs_filestream_pick(ip, *agp, free, nscan);
+
+	if (*agp == NULLAGNUMBER)
+		return 0;
+
+	err = -ENOMEM;
+	item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+	if (!item)
+		goto out_put_ag;
+
+	item->ag = *agp;
+	item->ip = ip;
+
+	err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
+	if (err) {
+		if (err == -EEXIST)
+			err = 0;
+		goto out_free_item;
+	}
+
+	return 0;
+
+out_free_item:
+	kmem_free(item);
+out_put_ag:
+	xfs_filestream_put_ag(mp, *agp);
+	return err;
+}
+
+/*
+ * Return the XFS inode of the parent directory of @ip, looked up through a
+ * dentry alias of @ip and pinned with igrab(), or NULL if no alias or parent
+ * inode could be obtained. Callers in this file release it with IRELE().
+ */
+static struct xfs_inode *
+xfs_filestream_get_parent(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip), *dir = NULL;
+	struct dentry		*dentry, *parent;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		goto out;
+
+	parent = dget_parent(dentry);
+	if (!parent)
+		goto out_dput;
+
+	dir = igrab(d_inode(parent));
+	dput(parent);
+
+out_dput:
+	dput(dentry);
+out:
+	return dir ? XFS_I(dir) : NULL;
+}
+
+/*
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
+ *
+ * Returns NULLAGNUMBER in case of an error.
+ */
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_inode	*pip = NULL;
+	xfs_agnumber_t		startag, ag = NULLAGNUMBER;
+	struct xfs_mru_cache_elem *mru;
+
+	ASSERT(S_ISREG(ip->i_d.di_mode));
+
+	pip = xfs_filestream_get_parent(ip);
+	if (!pip)
+		return NULLAGNUMBER;
+
+	mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+	if (mru) {
+		ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+		xfs_mru_cache_done(mp->m_filestream);
+
+		trace_xfs_filestream_lookup(ip, ag);
+		goto out;
+	}
+
+	/*
+	 * Set the starting AG using the rotor for inode32, otherwise
+	 * use the directory inode's AG.
+	 */
+	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+		xfs_agnumber_t	 rotorstep = xfs_rotorstep;
+		startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
+		mp->m_agfrotor = (mp->m_agfrotor + 1) %
+		                 (mp->m_sb.sb_agcount * rotorstep);
+	} else
+		startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
+
+	if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+		ag = NULLAGNUMBER;
+out:
+	IRELE(pip);
+	return ag;
+}
+
+/*
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
+ *
+ * *agp is never left as NULLAGNUMBER: if no AG was picked it is set to 0.
+ * Returns the error from xfs_filestream_pick_ag(), or 0 when the parent
+ * directory could not be found.
+ */
+int
+xfs_filestream_new_ag(
+	struct xfs_bmalloca	*ap,
+	xfs_agnumber_t		*agp)
+{
+	struct xfs_inode	*ip = ap->ip, *pip;
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_extlen_t		minlen = ap->length;
+	xfs_agnumber_t		startag = 0;
+	int			flags, err = 0;
+	struct xfs_mru_cache_elem *mru;
+
+	*agp = NULLAGNUMBER;
+
+	pip = xfs_filestream_get_parent(ip);
+	if (!pip)
+		goto exit;
+
+	mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+	if (mru) {
+		struct xfs_fstrm_item *item =
+			container_of(mru, struct xfs_fstrm_item, mru);
+		startag = (item->ag + 1) % mp->m_sb.sb_agcount;
+	}
+
+	flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
+	        (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
+
+	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
+
+	/*
+	 * Only free the item here so we skip over the old AG earlier.
+	 */
+	if (mru)
+		xfs_fstrm_free_func(mru);
+
+	IRELE(pip);
+exit:
+	if (*agp == NULLAGNUMBER)
+		*agp = 0;
+	return err;
+}
+
+/*
+ * Delete the MRU cache entry keyed by @ip's inode number, if any.
+ */
+void
+xfs_filestream_deassociate(
+	struct xfs_inode	*ip)
+{
+	xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
+
+/*
+ * Create the per-mount filestream MRU cache (mp->m_filestream) with
+ * xfs_fstrm_free_func() as its free callback. Returns the result of
+ * xfs_mru_cache_create().
+ */
+int
+xfs_filestream_mount(
+	xfs_mount_t	*mp)
+{
+	/*
+	 * The filestream timer tunable is currently fixed within the range of
+	 * one second to four minutes, with five seconds being the default. The
+	 * group count is somewhat arbitrary, but it'd be nice to adhere to the
+	 * timer tunable to within about 10 percent. This requires at least 10
+	 * groups.
+	 */
+	return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+				    10, xfs_fstrm_free_func);
+}
+
+/*
+ * Destroy the per-mount filestream MRU cache.
+ */
+void
+xfs_filestream_unmount(
+	xfs_mount_t	*mp)
+{
+	xfs_mru_cache_destroy(mp->m_filestream);
+}
diff --git a/kernel/fs/xfs/xfs_filestream.h b/kernel/fs/xfs/xfs_filestream.h
new file mode 100644
index 000000000..2ef43406e
--- /dev/null
+++ b/kernel/fs/xfs/xfs_filestream.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_FILESTREAM_H__
+#define __XFS_FILESTREAM_H__
+
+struct xfs_mount;
+struct xfs_inode;
+struct xfs_bmalloca;
+
+int xfs_filestream_mount(struct xfs_mount *mp);
+void xfs_filestream_unmount(struct xfs_mount *mp);
+void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
+int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
+
+/*
+ * Return non-zero if either the mount has XFS_MOUNT_FILESTREAMS set in
+ * m_flags or the inode has XFS_DIFLAG_FILESTREAM set in di_flags.
+ */
+static inline int
+xfs_inode_is_filestream(
+	struct xfs_inode	*ip)
+{
+	return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
+		(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
+}
+
+#endif /* __XFS_FILESTREAM_H__ */
diff --git a/kernel/fs/xfs/xfs_fsops.c b/kernel/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000..cb7e8a29d
--- /dev/null
+++ b/kernel/fs/xfs/xfs_fsops.c
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_fsops.h"
+#include "xfs_itable.h"
+#include "xfs_trans_space.h"
+#include "xfs_rtalloc.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_filestream.h"
+
+/*
+ * File system operations
+ */
+
+/*
+ * Fill *geo from the in-core superblock of @mp. *geo is zeroed first; the
+ * stripe fields are filled for new_version >= 2, the version/flags and
+ * sector/directory sizes for new_version >= 3, and the LOGV2 flag and log
+ * stripe unit for new_version >= 4. Always returns 0.
+ */
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version)
+{
+
+	memset(geo, 0, sizeof(*geo));
+
+	geo->blocksize = mp->m_sb.sb_blocksize;
+	geo->rtextsize = mp->m_sb.sb_rextsize;
+	geo->agblocks = mp->m_sb.sb_agblocks;
+	geo->agcount = mp->m_sb.sb_agcount;
+	geo->logblocks = mp->m_sb.sb_logblocks;
+	geo->sectsize = mp->m_sb.sb_sectsize;
+	geo->inodesize = mp->m_sb.sb_inodesize;
+	geo->imaxpct = mp->m_sb.sb_imax_pct;
+	geo->datablocks = mp->m_sb.sb_dblocks;
+	geo->rtblocks = mp->m_sb.sb_rblocks;
+	geo->rtextents = mp->m_sb.sb_rextents;
+	geo->logstart = mp->m_sb.sb_logstart;
+	ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
+	memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
+	if (new_version >= 2) {
+		geo->sunit = mp->m_sb.sb_unit;
+		geo->swidth = mp->m_sb.sb_width;
+	}
+	if (new_version >= 3) {
+		geo->version = XFS_FSOP_GEOM_VERSION;
+		geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+			     XFS_FSOP_GEOM_FLAGS_DIRV2 |
+			(xfs_sb_version_hasattr(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
+			(xfs_sb_version_hasquota(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
+			(xfs_sb_version_hasalign(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
+			(xfs_sb_version_hasdalign(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
+			(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
+			(xfs_sb_version_hassector(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
+			(xfs_sb_version_hasasciici(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
+			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
+			(xfs_sb_version_hasattr2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
+			(xfs_sb_version_hascrc(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+			(xfs_sb_version_hasftype(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+			(xfs_sb_version_hasfinobt(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
+		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
+				mp->m_sb.sb_logsectsize : BBSIZE;
+		geo->rtsectsize = mp->m_sb.sb_blocksize;
+		geo->dirblocksize = mp->m_dir_geo->blksize;
+	}
+	if (new_version >= 4) {
+		geo->flags |=
+			(xfs_sb_version_haslogv2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
+		geo->logsunit = mp->m_sb.sb_logsunit;
+	}
+	return 0;
+}
+
+/*
+ * Get an uncached, zeroed buffer of @numblks basic blocks on the data device,
+ * point it at disk address @blkno and attach @ops as its buffer ops.
+ * Returns NULL if the buffer could not be obtained.
+ */
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags,
+	const struct xfs_buf_ops *ops)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
+
+	return bp;
+}
+
+/*
+ * Grow the data section to in->newblocks filesystem blocks and set the
+ * maximum inode percentage to in->imaxpct.
+ *
+ * Headers and btree root blocks for any new AGs are written synchronously
+ * outside the transaction; the extension of the old last AG and the
+ * superblock counter changes are made in a synchronous growfs transaction.
+ * After the commit the secondary superblocks are rewritten; failures there
+ * are logged and the last such error is returned, but do not stop the loop.
+ *
+ * Returns 0 or a negative errno (-EINVAL for a shrinking size or a
+ * percentage outside 0..100).
+ */
+static int
+xfs_growfs_data_private(
+	xfs_mount_t		*mp,		/* mount point for filesystem */
+	xfs_growfs_data_t	*in)		/* growfs data input struct */
+{
+	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
+	xfs_agi_t		*agi;
+	xfs_agnumber_t		agno;
+	xfs_extlen_t		agsize;
+	xfs_extlen_t		tmpsize;
+	xfs_alloc_rec_t		*arec;
+	xfs_buf_t		*bp;
+	int			bucket;
+	int			dpct;
+	int			error, saved_error = 0;
+	xfs_agnumber_t		nagcount;
+	xfs_agnumber_t		nagimax = 0;
+	xfs_rfsblock_t		nb, nb_mod;
+	xfs_rfsblock_t		new;
+	xfs_rfsblock_t		nfree;
+	xfs_agnumber_t		oagcount;
+	int			pct;
+	xfs_trans_t		*tp;
+
+	nb = in->newblocks;
+	pct = in->imaxpct;
+	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+		return -EINVAL;
+	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+		return error;
+	dpct = pct - mp->m_sb.sb_imax_pct;
+	/* Read the last sector of the new size to check it is accessible. */
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
+				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
+				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
+		return error;
+	xfs_buf_relse(bp);
+
+	new = nb;	/* use new as a temporary here */
+	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
+	nagcount = new + (nb_mod != 0);
+	/* A trailing partial AG smaller than XFS_MIN_AG_BLOCKS is dropped. */
+	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+		nagcount--;
+		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
+		if (nb < mp->m_sb.sb_dblocks)
+			return -EINVAL;
+	}
+	new = nb - mp->m_sb.sb_dblocks;
+	oagcount = mp->m_sb.sb_agcount;
+
+	/* allocate the new per-ag structures */
+	if (nagcount > oagcount) {
+		error = xfs_initialize_perag(mp, nagcount, &nagimax);
+		if (error)
+			return error;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+	tp->t_flags |= XFS_TRANS_RESERVE;
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
+				  XFS_GROWFS_SPACE_RES(mp), 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	/*
+	 * Write new AG headers to disk. Non-transactional, but written
+	 * synchronously so they are completed prior to the growfs transaction
+	 * being logged.
+	 */
+	nfree = 0;
+	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		__be32	*agfl_bno;
+
+		/*
+		 * AG freespace header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agf_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agf = XFS_BUF_TO_AGF(bp);
+		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+		agf->agf_seqno = cpu_to_be32(agno);
+		if (agno == nagcount - 1)
+			agsize =
+				nb -
+				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+		else
+			agsize = mp->m_sb.sb_agblocks;
+		agf->agf_length = cpu_to_be32(agsize);
+		agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+		agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+		agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+		agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+		agf->agf_flfirst = 0;
+		agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1);
+		agf->agf_flcount = 0;
+		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+		agf->agf_freeblks = cpu_to_be32(tmpsize);
+		agf->agf_longest = cpu_to_be32(tmpsize);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agfl_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		if (xfs_sb_version_hascrc(&mp->m_sb)) {
+			agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+			agfl->agfl_seqno = cpu_to_be32(agno);
+			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+		}
+
+		agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * AG inode header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agi_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		agi = XFS_BUF_TO_AGI(bp);
+		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+		agi->agi_seqno = cpu_to_be32(agno);
+		agi->agi_length = cpu_to_be32(agsize);
+		agi->agi_count = 0;
+		agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+		agi->agi_level = cpu_to_be32(1);
+		agi->agi_freecount = 0;
+		agi->agi_newino = cpu_to_be32(NULLAGINO);
+		agi->agi_dirino = cpu_to_be32(NULLAGINO);
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+			agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+			agi->agi_free_level = cpu_to_be32(1);
+		}
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+			agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * BNO btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
+						agno, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+		arec->ar_blockcount = cpu_to_be32(
+			agsize - be32_to_cpu(arec->ar_startblock));
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * CNT btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
+						agno, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
+		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
+		arec->ar_blockcount = cpu_to_be32(
+			agsize - be32_to_cpu(arec->ar_startblock));
+		nfree += be32_to_cpu(arec->ar_blockcount);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * INO btree root block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
+		if (!bp) {
+			error = -ENOMEM;
+			goto error0;
+		}
+
+		if (xfs_sb_version_hascrc(&mp->m_sb))
+			xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
+						agno, XFS_BTREE_CRC_BLOCKS);
+		else
+			xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
+						agno, 0);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
+		/*
+		 * FINO btree root block
+		 */
+		if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+			bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
+			if (!bp) {
+				error = -ENOMEM;
+				goto error0;
+			}
+
+			if (xfs_sb_version_hascrc(&mp->m_sb))
+				xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+						     0, 0, agno,
+						     XFS_BTREE_CRC_BLOCKS);
+			else
+				xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+						     0, agno, 0);
+
+			error = xfs_bwrite(bp);
+			xfs_buf_relse(bp);
+			if (error)
+				goto error0;
+		}
+
+	}
+	xfs_trans_agblocks_delta(tp, nfree);
+	/*
+	 * There are new blocks in the old last a.g.
+	 */
+	if (new) {
+		/*
+		 * Change the agi length.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agi = XFS_BUF_TO_AGI(bp);
+		be32_add_cpu(&agi->agi_length, new);
+		ASSERT(nagcount == oagcount ||
+		       be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+		/*
+		 * Change agf length.
+		 */
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agf = XFS_BUF_TO_AGF(bp);
+		be32_add_cpu(&agf->agf_length, new);
+		ASSERT(be32_to_cpu(agf->agf_length) ==
+		       be32_to_cpu(agi->agi_length));
+
+		xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+		/*
+		 * Free the new space.
+		 */
+		error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
+			be32_to_cpu(agf->agf_length) - new), new);
+		if (error) {
+			goto error0;
+		}
+	}
+
+	/*
+	 * Update changed superblock fields transactionally. These are not
+	 * seen by the rest of the world until the transaction commit applies
+	 * them atomically to the superblock.
+	 */
+	if (nagcount > oagcount)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+	if (nb > mp->m_sb.sb_dblocks)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
+				 nb - mp->m_sb.sb_dblocks);
+	if (nfree)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+	if (dpct)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+	if (error)
+		return error;
+
+	/* New allocation groups fully initialized, so update mount struct */
+	if (nagimax)
+		mp->m_maxagi = nagimax;
+	if (mp->m_sb.sb_imax_pct) {
+		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+	xfs_set_low_space_thresholds(mp);
+
+	/* update secondary superblocks. */
+	for (agno = 1; agno < nagcount; agno++) {
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  &xfs_sb_buf_ops);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			} else
+				error = -ENOMEM;
+		}
+
+		/*
+		 * If we get an error reading or writing alternate superblocks,
+		 * continue. xfs_repair chooses the "best" superblock based
+		 * on most matches; if we break early, we'll leave more
+		 * superblocks un-updated than updated, and xfs_repair may
+		 * pick them over the properly-updated primary.
+		 */
+		if (error) {
+			xfs_warn(mp,
+		"error %d reading secondary superblock for ag %d",
+				error, agno);
+			saved_error = error;
+			continue;
+		}
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error) {
+			xfs_warn(mp,
+		"write error %d updating secondary superblock for ag %d",
+				error, agno);
+			saved_error = error;
+			continue;
+		}
+	}
+	return saved_error ? saved_error : error;
+
+ error0:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+/*
+ * Validate a log grow request. Resizing or moving the log is not
+ * implemented: the function returns -EINVAL for a size below the minimum or
+ * for a request that changes nothing, and -ENOSYS otherwise.
+ */
+static int
+xfs_growfs_log_private(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_growfs_log_t	*in)	/* growfs log input struct */
+{
+	xfs_extlen_t		nb;
+
+	nb = in->newblocks;
+	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+		return -EINVAL;
+	if (nb == mp->m_sb.sb_logblocks &&
+	    in->isint == (mp->m_sb.sb_logstart != 0))
+		return -EINVAL;
+	/*
+	 * Moving the log is hard, need new interfaces to sync
+	 * the log first, hold off all activity while moving it.
+	 * Can have shorter or longer log in the same space,
+	 * or transform internal to external log or vice versa.
+	 */
+	return -ENOSYS;
+}
+
+/*
+ * protected versions of growfs function acquire and release locks on the mount
+ * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
+ * XFS_IOC_FSGROWFSRT
+ *
+ * Both wrappers below require CAP_SYS_ADMIN (-EPERM otherwise) and return
+ * -EWOULDBLOCK instead of waiting if m_growlock is already held.
+ */
+
+
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in)
+{
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!mutex_trylock(&mp->m_growlock))
+		return -EWOULDBLOCK;
+	error = xfs_growfs_data_private(mp, in);
+	/*
+	 * Increment the generation unconditionally, the error could be from
+	 * updating the secondary superblocks, in which case the new size
+	 * is live already.
+	 */
+	mp->m_generation++;
+	mutex_unlock(&mp->m_growlock);
+	return error;
+}
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in)
+{
+	int error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (!mutex_trylock(&mp->m_growlock))
+		return -EWOULDBLOCK;
+	error = xfs_growfs_log_private(mp, in);
+	mutex_unlock(&mp->m_growlock);
+	return error;
+}
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ *
+ * The inode and free-block values are read from the per-cpu counters with
+ * percpu_counter_read_positive(); freedata has XFS_ALLOC_SET_ASIDE(mp)
+ * subtracted. freertx is read from the superblock under m_sb_lock.
+ * Always returns 0.
+ */
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt)
+{
+	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
+	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
+	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+							XFS_ALLOC_SET_ASIDE(mp);
+
+	spin_lock(&mp->m_sb_lock);
+	cnt->freertx = mp->m_sb.sb_frextents;
+	spin_unlock(&mp->m_sb_lock);
+	return 0;
+}
+
+/*
+ * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount table. The number of unused reserved blocks
+ * is kept in m_resblks_avail.
+ *
+ * Reserve the requested number of blocks if available. Otherwise return
+ * as many as possible to satisfy the request.
The actual number + * reserved are returned in outval + * + * A null inval pointer indicates that only the current reserved blocks + * available should be returned no settings are changed. + */ + +int +xfs_reserve_blocks( + xfs_mount_t *mp, + __uint64_t *inval, + xfs_fsop_resblks_t *outval) +{ + __int64_t lcounter, delta, fdblks_delta; + __uint64_t request; + + /* If inval is null, report current values and return */ + if (inval == (__uint64_t *)NULL) { + if (!outval) + return -EINVAL; + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + return 0; + } + + request = *inval; + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + */ +retry: + spin_lock(&mp->m_sb_lock); + + /* + * If our previous reservation was larger than the current value, + * then move any unused blocks back to the free pool. 
+ */ + fdblks_delta = 0; + if (mp->m_resblks > request) { + lcounter = mp->m_resblks_avail - request; + if (lcounter > 0) { /* release unused blocks */ + fdblks_delta = lcounter; + mp->m_resblks_avail -= lcounter; + } + mp->m_resblks = request; + } else { + __int64_t free; + + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + + delta = request - mp->m_resblks; + lcounter = free - delta; + if (lcounter < 0) { + /* We can't satisfy the request, just get what we can */ + mp->m_resblks += free; + mp->m_resblks_avail += free; + fdblks_delta = -free; + } else { + fdblks_delta = -delta; + mp->m_resblks = request; + mp->m_resblks_avail += delta; + } + } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); + + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at its max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... 
+ */ + int error; + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + if (error == -ENOSPC) + goto retry; + } + return 0; +} + +int +xfs_fs_goingdown( + xfs_mount_t *mp, + __uint32_t inflags) +{ + switch (inflags) { + case XFS_FSOP_GOING_FLAGS_DEFAULT: { + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); + + if (sb && !IS_ERR(sb)) { + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + thaw_bdev(sb->s_bdev, sb); + } + + break; + } + case XFS_FSOP_GOING_FLAGS_LOGFLUSH: + xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); + break; + case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH: + xfs_force_shutdown(mp, + SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. 
Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/kernel/fs/xfs/xfs_fsops.h b/kernel/fs/xfs/xfs_fsops.h new file mode 100644 index 000000000..1b6a98b66 --- /dev/null +++ b/kernel/fs/xfs/xfs_fsops.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FSOPS_H__ +#define __XFS_FSOPS_H__ + +extern int xfs_fs_geometry(xfs_mount_t *mp, xfs_fsop_geom_t *geo, int nversion); +extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); +extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); +extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, + xfs_fsop_resblks_t *outval); +extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); + +#endif /* __XFS_FSOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_globals.c b/kernel/fs/xfs/xfs_globals.c new file mode 100644 index 000000000..4d41b2412 --- /dev/null +++ b/kernel/fs/xfs/xfs_globals.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_sysctl.h" + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. 
+ * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. + */ +xfs_param_t xfs_params = { + /* each entry is { MIN, DFLT, MAX } */ + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 255 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, + .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, /* seconds, unlike the centisecond timers above */ +}; + +struct xfs_globals xfs_globals = { + .log_recovery_delay = 0, /* no delay by default */ +}; diff --git a/kernel/fs/xfs/xfs_icache.c b/kernel/fs/xfs/xfs_icache.c new file mode 100644 index 000000000..76a9f2783 --- /dev/null +++ b/kernel/fs/xfs/xfs_icache.c @@ -0,0 +1,1418 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + +#include +#include + +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. Returns NULL on failure. + */ +struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway.
+ */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + XFS_STATS_INC(vn_active); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); /* runs as the call_rcu() callback queued by xfs_inode_free() */ +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races.
+ */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found in the cache. + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = -EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimable state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = -EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = -ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state.
+ */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. Both locks are retaken because out_error drops them. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = -EAGAIN; + goto out_error; + } + + /* We've got a live one.
*/ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: /* entered with i_flags_lock and the RCU read lock held; drops both */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return -ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = -ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = -EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time.
+ */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; + ip->i_pdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode; any insert failure is returned as -EAGAIN */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = -EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values.
+ */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * acquires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode number zero and inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return -EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can setup the inode + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_existing_inode(ip); + return 0; + +out_error_or_again: + if (error == -EAGAIN) { + delay(1); /* -EAGAIN: back off briefly, then redo the whole lookup */ + goto again; + } + xfs_perag_put(pag); + return error; +} + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy.
+ */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EFSCORRUPTED; + + /* If we can't grab the inode, it must be on its way to reclaim. */ + if (!igrab(inode)) + return -ENOENT; + + /* inode is valid and igrab() succeeded */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return -ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock.
If we found + * nothing, nr_found == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that led + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], flags, args); + IRELE(batch[i]); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == -EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); /* execute() returned -EAGAIN for some inodes: pause, then rescan from index 0 */ + goto restart; + } + return last_error; +} + +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
+ */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); /* requeue only if the perag tree still carries the EOFBLOCKS tag */ +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; /* the next lookup uses the following AG number */ + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); /* tag -1 selects the untagged gang lookup */ + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { /* NOTE(review): presumably returns the next AG at or after ag carrying tag - confirm */ + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive.
+ */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); /* a sixth of the sync interval; * 10 converts centisecs to msecs */ + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* first reclaimable inode in this AG: propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away.
+ */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +STATIC void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it (XFS_IRECLAIM is then set on the inode), + * non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. + */ + if ((flags & SYNC_TRYLOCK) && + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + return 1; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us.
+ * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check that + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, async - requeue + * dirty, sync 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. + * + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background reclaim, we only + * bother to reclaim clean inodes anyway.
+ * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, async => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, async => requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + struct xfs_buf *bp = NULL; + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* + * Now we have an inode that needs flushing. + * + * Note that xfs_iflush will never block on the inode buffer lock, as + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluster() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode.
+ */ + error = xfs_iflush(ip, &bp); + if (error == -EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + + xfs_iflock(ip); +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree, assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_inode_free(ip); + return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return -EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. + */ + return 0; +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes.
If we don't, + * then a shut down during filesystem unmount reclaim walk will leak all the + * unreclaimed inodes. + */ +STATIC int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the RCU read lock. An + * empty lookup (nr_found == 0) has already exited above. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that led us + * to see this inode, so another lookup from + * the same index will not find it again.
+ */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + cond_resched(); + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * If we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e. + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensures that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return last_error; +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Scan a certain number of inodes for reclaim. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes.
+ */ +long +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) +{ + /* kick background reclaimer and push the AIL */ + xfs_reclaim_work_queue(mp); + xfs_ail_push_all(mp->m_ail); + + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} + +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; + + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; +} + +/* + * A union-based inode filtering algorithm. Process the inode if any of the + * criteria match. This is for global/internal scans only. 
+ */ +STATIC int +xfs_inode_match_id_union( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 1; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) == eofb->eof_prid) + return 1; + + return 0; +} + +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. 
+ */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); + + /* don't revisit the inode if we're not waiting */ + if (ret == -EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + eofb, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Run eofblocks scans on the quotas applicable to the inode. For inodes with + * multiple quotas, we don't know exactly which quota caused an allocation + * failure. We make a best effort by including each quota under low free space + * conditions (less than 1% free space) in the scan. + */ +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + int scan = 0; + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + /* + * Set the scan owner to avoid a potential livelock. Otherwise, the scan + * can repeatedly trylock on the inode we're currently processing. We + * run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. 
+ */ + eofb.eof_scan_owner = ip->i_ino; + eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_USER); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_uid = VFS_I(ip)->i_uid; + eofb.eof_flags |= XFS_EOF_FLAGS_UID; + scan = 1; + } + } + + if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { + dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); + if (dq && xfs_dquot_lowsp(dq)) { + eofb.eof_gid = VFS_I(ip)->i_gid; + eofb.eof_flags |= XFS_EOF_FLAGS_GID; + scan = 1; + } + } + + if (scan) + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + + return scan; +} + +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if 
(!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/kernel/fs/xfs/xfs_icache.h b/kernel/fs/xfs/xfs_icache.h new file mode 100644 index 000000000..62f1f91c3 --- /dev/null +++ b/kernel/fs/xfs/xfs_icache.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +struct xfs_eofblocks { + __u32 eof_flags; + kuid_t eof_uid; + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; + xfs_ino_t eof_scan_owner; +}; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +/* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 + +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + +void xfs_reclaim_worker(struct work_struct *work); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); + +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); +void xfs_eofblocks_worker(struct work_struct *); + +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, 
void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int tag); + +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return -EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return -EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return -EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + dst->eof_scan_owner = NULLFSINO; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return -EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return -EINVAL; + } + return 0; +} + +#endif diff --git a/kernel/fs/xfs/xfs_icreate_item.c b/kernel/fs/xfs/xfs_icreate_item.c new file mode 100644 index 000000000..d45ca72af --- /dev/null +++ b/kernel/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" +#include "xfs_log.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This adds the number of iovecs and bytes needed to log the given inode + * create item to the caller's running totals in *nvecs and *nbytes. + * + * We only need one iovec for the icreate log structure. + */ +STATIC void +xfs_icreate_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_icreate_log); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return.
*/ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all inode create (icreate) log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent.
+ * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/kernel/fs/xfs/xfs_icreate_item.h b/kernel/fs/xfs/xfs_icreate_item.h new file mode 100644 index 000000000..59e89f87c --- /dev/null +++ b/kernel/fs/xfs/xfs_icreate_item.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008-2010, Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 + +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; + +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/kernel/fs/xfs/xfs_inode.c b/kernel/fs/xfs/xfs_inode.c new file mode 100644 index 000000000..539a85fdd --- /dev/null +++ b/kernel/fs/xfs/xfs_inode.c @@ -0,0 +1,3606 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_attr.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_inode_item.h" +#include "xfs_ialloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_filestream.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" + +kmem_zone_t *xfs_inode_zone; + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); + +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); + +/* + * Helper function to extract the extent size hint from an inode. + * + * Returns di_extsize when XFS_DIFLAG_EXTSIZE is set and di_extsize is + * non-zero; otherwise returns the superblock's sb_rextsize for realtime + * inodes, and 0 for everything else. + */ +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; +} + +/* + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents.
The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. + * + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. + */ +uint +xfs_ilock_data_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* + * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. + * + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. + * + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order means that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO.
These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). + */ +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), 
except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_mmaplock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_mmaplock; + } + return 1; + +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + 
mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); +out: + return 0; +} + +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); + + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); +} + +/* + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. 
+ */ +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) +{ + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); + + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} + +#if defined(DEBUG) || defined(XFS_WARN) +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); + } + + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); + } + + ASSERT(0); + return 0; +} +#endif + +#ifdef DEBUG +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; +#endif + +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. 
+ */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; + + return lock_mode; +} + +/* + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. + */ +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) +{ + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; + + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); + + try_lock = 0; + i = 0; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); + + if (i && (ips[i] == ips[i - 1])) /* Already locked */ + continue; + + /* + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. 
+ */ + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + try_lock++; + } + } + + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; + + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { + /* + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. + */ + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif + } + i = 0; + try_lock = 0; + goto again; + } + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif +} + +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. 
+ */ +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) +{ + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; + + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + + ASSERT(ip0->i_ino != ip1->i_ino); + + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; + } + + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + + /* + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. + */ + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } +} + + +void +__xfs_iflock( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); + + finish_wait(wq, &wait.wait); +} + +STATIC uint +_xfs_dic2xflags( + __uint16_t di_flags) +{ + uint flags = 0; + + if (di_flags & XFS_DIFLAG_ANY) { + if (di_flags & XFS_DIFLAG_REALTIME) + flags |= XFS_XFLAG_REALTIME; + if (di_flags & XFS_DIFLAG_PREALLOC) + flags |= XFS_XFLAG_PREALLOC; + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= XFS_XFLAG_IMMUTABLE; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= XFS_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= XFS_XFLAG_SYNC; + if (di_flags 
& XFS_DIFLAG_NOATIME) + flags |= XFS_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= XFS_XFLAG_NODUMP; + if (di_flags & XFS_DIFLAG_RTINHERIT) + flags |= XFS_XFLAG_RTINHERIT; + if (di_flags & XFS_DIFLAG_PROJINHERIT) + flags |= XFS_XFLAG_PROJINHERIT; + if (di_flags & XFS_DIFLAG_NOSYMLINKS) + flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; + if (di_flags & XFS_DIFLAG_NODEFRAG) + flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; + } + + return flags; +} + +uint +xfs_ip2xflags( + xfs_inode_t *ip) +{ + xfs_icdinode_t *dic = &ip->i_d; + + return _xfs_dic2xflags(dic->di_flags) | + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); +} + +uint +xfs_dic2xflags( + xfs_dinode_t *dip) +{ + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); +} + +/* + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. + */ +int +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) +{ + xfs_ino_t inum; + int error; + uint lock_mode; + + trace_xfs_lookup(dp, name); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return -EIO; + + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; + + return 0; + +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; + return error; +} + +/* + * Allocate an inode on disk and return a copy of its in-core version. 
+ * The in-core inode is locked exclusively. Set mode, nlink, and rdev + * appropriately within the inode. The uid and gid for the inode are + * set according to the contents of the given cred structure. + * + * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. + * + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. + * The caller should then commit the current transaction, start a new + * transaction, and call xfs_ialloc() again to actually get the inode. + * + * To ensure that some other process does not grab the inode that + * was allocated during the first call to xfs_ialloc(), this routine + * also returns the [locked] bp pointing to the head of the freelist + * as ialloc_context. The caller should hold this buffer across + * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. 
+ */ +int +xfs_ialloc( + xfs_trans_t *tp, + xfs_inode_t *pip, + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, + int okalloc, + xfs_buf_t **ialloc_context, + xfs_inode_t **ipp) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_ino_t ino; + xfs_inode_t *ip; + uint flags; + int error; + struct timespec tv; + + /* + * Call the space management code to pick + * the on-disk inode to be allocated. + */ + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + ialloc_context, &ino); + if (error) + return error; + if (*ialloc_context || ino == NULLFSINO) { + *ipp = NULL; + return 0; + } + ASSERT(*ialloc_context == NULL); + + /* + * Get the in-core inode with the lock held exclusively. + * This is because we're setting fields here we need + * to prevent others from looking at until we're done. + */ + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, + XFS_ILOCK_EXCL, &ip); + if (error) + return error; + ASSERT(ip != NULL); + + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + + ip->i_d.di_mode = mode; + ip->i_d.di_onlink = 0; + ip->i_d.di_nlink = nlink; + ASSERT(ip->i_d.di_nlink == nlink); + ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); + ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + xfs_set_projid(ip, prid); + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + + if (pip && XFS_INHERIT_GID(pip)) { + ip->i_d.di_gid = pip->i_d.di_gid; + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { + ip->i_d.di_mode |= S_ISGID; + } + } + + /* + * If the group ID of the new file does not match the effective group + * ID or one of the supplementary group IDs, the S_ISGID bit is cleared + * (and only if the irix_sgid_inherit compatibility variable is set). 
+ */ + if ((irix_sgid_inherit) && + (ip->i_d.di_mode & S_ISGID) && + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { + ip->i_d.di_mode &= ~S_ISGID; + } + + ip->i_d.di_size = 0; + ip->i_d.di_nextents = 0; + ASSERT(ip->i_d.di_nblocks == 0); + + tv = current_fs_time(mp->m_super); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + + /* + * di_gen will have been taken care of in xfs_iread. + */ + ip->i_d.di_extsize = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_dmstate = 0; + ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + + flags = XFS_ILOG_CORE; + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_u2.if_rdev = rdev; + ip->i_df.if_flags = 0; + flags |= XFS_ILOG_DEV; + break; + case S_IFREG: + case S_IFDIR: + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + uint di_flags = 0; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= 
XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + ip->i_d.di_flags |= di_flags; + } + /* FALLTHROUGH */ + case S_IFLNK: + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_flags = XFS_IFEXTENTS; + ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0; + ip->i_df.if_u1.if_extents = NULL; + break; + default: + ASSERT(0); + } + /* + * Attribute fork settings for new inode. + */ + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_anextents = 0; + + /* + * Log the new values stuffed into the inode. + */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, flags); + + /* now that we have an i_mode we can setup the inode structure */ + xfs_setup_inode(ip); + + *ipp = ip; + return 0; +} + +/* + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * + */ +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. 
*/ + int *committed) + +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + int code; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); + + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return -ENOSPC; + } + + /* + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. + */ + if (ialloc_context) { + struct xfs_trans_res tres; + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. 
+ */ + tres.tr_logres = xfs_trans_get_log_res(tp); + tres.tr_logcount = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + code = xfs_trans_reserve(tp, &tres, 0, 0); + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. 
+ */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; + } + + *ipp = ip; + *tpp = tp; + + return 0; +} + +/* + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. + */ +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + int error; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * Increment the link count on an inode & log the change. 
+ */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_mkdir; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + tres = &M_RES(mp)->tr_create; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } + + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. 
+ */ + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(mp); + error = xfs_trans_reserve(tp, tres, resblks, 0); + } + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + goto out_trans_cancel; + } + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == -ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? 
+ resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != -ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. 
+ */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == -ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == -ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. 
+ */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; +} + +int +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) +{ + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; + + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project 
IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = -EXDEV; + goto error_return; + } + + if (!resblks) { + error = xfs_dir_canenter(tp, tdp, target_name); + if (error) + goto error_return; + } + + xfs_bmap_init(&free_list, &first_block); + + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + + /* + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* + * Free up the underlying blocks past new_size. The new size must be smaller + * than the current size. This routine can be used both for the attribute and + * data fork, and does not modify the inode size, which is left to the caller. + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. 
The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. + */ +int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp = *tpp; + struct xfs_trans *ntp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len; + int committed; + int error = 0; + int done = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(new_size <= XFS_ISIZE(ip)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_lock_flags == 0); + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + + trace_xfs_itruncate_extents_start(ip, new_size); + + /* + * Since it is possible for space to become allocated beyond + * the end of the file (in a crash where the space is allocated + * but the inode size is not yet updated), simply remove any + * blocks which show up between the new EOF and the maximum + * possible file size. If the first block to be removed is + * beyond the maximum file size (ie it is the same as last_block), + * then there is nothing to do. 
+ */ + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (first_unmap_block == last_block) + return 0; + + ASSERT(first_unmap_block < last_block); + unmap_len = last_block - first_unmap_block + 1; + while (!done) { + xfs_bmap_init(&free_list, &first_block); + error = xfs_bunmapi(tp, ip, + first_unmap_block, unmap_len, + xfs_bmapi_aflag(whichfork), + XFS_ITRUNC_MAX_EXTENTS, + &first_block, &free_list, + &done); + if (error) + goto out_bmap_cancel; + + /* + * Duplicate the transaction that has the permanent + * reservation and commit the old transaction. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (committed) + xfs_trans_ijoin(tp, ip, 0); + if (error) + goto out_bmap_cancel; + + if (committed) { + /* + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + ntp = xfs_trans_dup(tp); + error = xfs_trans_commit(tp, 0); + tp = ntp; + + xfs_trans_ijoin(tp, ip, 0); + + if (error) + goto out; + + /* + * Transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto out; + } + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + +out: + *tpp = tp; + return error; +out_bmap_cancel: + /* + * If the bunmapi call encounters an error, return to the caller where + * the transaction can be properly aborted. We just need to make sure + * we're not holding any resources that we were not when we came in. 
+ */ + xfs_bmap_cancel(&free_list); + goto out; +} + +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (ip->i_delayed_blks > 0) { + error = filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. 
Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != -EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); + } + return 0; +} + +/* + * xfs_inactive_truncate + * + * Called to perform a truncate when an inode becomes unlinked. + */ +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. + */ + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. 
+ */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == -ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. 
+ */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * xfs_inactive + * + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +void +xfs_inactive( + xfs_inode_t *ip) +{ + struct xfs_mount *mp; + int error; + int truncate = 0; + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (ip->i_d.di_mode == 0) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return; + } + + mp = ip->i_mount; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. 
+ */ + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; + } + + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return; + + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. If also blows away the in-core attribute fork. + */ + if (XFS_IFORK_Q(ip)) { + error = xfs_attr_inactive(ip); + if (error) + return; + } + + ASSERT(!ip->i_afp); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); + + /* + * Free the inode. + */ + error = xfs_inactive_ifree(ip); + if (error) + return; + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +} + +/* + * This is called when the inode's link count goes to 0. + * We place the on-disk inode on a list in the AGI. It + * will be pulled from this list when the inode is freed. + */ +int +xfs_iunlink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agino_t agino; + short bucket_index; + int offset; + int error; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + mp = tp->t_mountp; + + /* + * Get the agi buffer first. It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + if (error) + return error; + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. 
+ */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index]); + ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + + if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { + /* + * There is already another inode in the bucket we need + * to add ourselves to. Add us at the front of the list. + * Here we put the head pointer into our next pointer, + * and then we fall through to point the head at us. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) + return error; + + ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); + dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } + + /* + * Point the bucket head pointer at the inode being inserted. + */ + ASSERT(agino != 0); + agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + return 0; +} + +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_ino_t next_ino; + xfs_mount_t *mp; + xfs_agi_t *agi; + xfs_dinode_t *dip; + xfs_buf_t *agibp; + xfs_buf_t *ibp; + xfs_agnumber_t agno; + xfs_agino_t agino; + xfs_agino_t next_agino; + xfs_buf_t *last_ibp; + xfs_dinode_t *last_dip = NULL; + short bucket_index; + int offset, last_offset = 0; + int error; + + mp = tp->t_mountp; + agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + + /* + * Get the agi buffer first. 
It ensures lock ordering + * on the list. + */ + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(agibp); + + /* + * Get the index into the agi hash table for the + * list this inode will go on. + */ + agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + ASSERT(agino != 0); + bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); + ASSERT(agi->agi_unlinked[bucket_index]); + + if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { + /* + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the bucket head pointer at the next inode. + */ + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); + offset = offsetof(xfs_agi_t, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); + xfs_trans_log_buf(tp, agibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + } else { + /* + * We need to search the list for the inode being freed. 
+ */ + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + last_ibp = NULL; + while (next_agino != agino) { + struct xfs_imap imap; + + if (last_ibp) + xfs_trans_brelse(tp, last_ibp); + + imap.im_blkno = 0; + next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + last_offset = imap.im_boffset; + next_agino = be32_to_cpu(last_dip->di_next_unlinked); + ASSERT(next_agino != NULLAGINO); + ASSERT(next_agino != 0); + } + + /* + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", + __func__, error); + return error; + } + next_agino = be32_to_cpu(dip->di_next_unlinked); + ASSERT(next_agino != 0); + ASSERT(next_agino != agino); + if (next_agino != NULLAGINO) { + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, ibp); + } else { + xfs_trans_brelse(tp, ibp); + } + /* + * Point the previous inode on the list to the next inode. 
+ */ + last_dip->di_next_unlinked = cpu_to_be32(next_agino); + ASSERT(next_agino != 0); + offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + + xfs_trans_inode_buf(tp, last_ibp); + xfs_trans_log_buf(tp, last_ibp, offset, + (offset + sizeof(xfs_agino_t) - 1)); + xfs_inobp_check(mp, last_ibp); + } + return 0; +} + +/* + * A big issue when freeing the inode cluster is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ +STATIC int +xfs_ifree_cluster( + xfs_inode_t *free_ip, + xfs_trans_t *tp, + xfs_ino_t inum) +{ + xfs_mount_t *mp = free_ip->i_mount; + int blks_per_cluster; + int inodes_per_cluster; + int nbufs; + int i, j; + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_inode_t *ip; + xfs_inode_log_item_t *iip; + xfs_log_item_t *lip; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; + + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), + XFS_INO_TO_AGBNO(mp, inum)); + + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + + if (!bp) + return -ENOMEM; + + /* + * This buffer may not have been correctly initialised as we + * didn't read it from disk. 
That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. + */ + bp->b_ops = &xfs_inode_buf_ops; + + /* + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. + */ + lip = bp->b_fspriv; + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); + } + lip = lip->li_bio_list; + } + + + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. + * + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. + */ + for (i = 0; i < inodes_per_cluster; i++) { +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); + continue; + } + + /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. 
+ */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ + iip = ip->i_itemp; + if (!iip || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + xfs_perag_put(pag); + return 0; +} + +/* + * This is called to return an inode to the inode free list. + * The inode should already be truncated to 0 length and have + * no pages associated with it. This routine also assumes that + * the inode is already a part of the transaction. + * + * The on-disk copy of the inode will have been added to the list + * of unlinked inodes in the AGI. We need to remove the inode from + * that list atomically with respect to freeing it here. 
+ */ +int +xfs_ifree( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_bmap_free_t *flist) +{ + int error; + int delete; + xfs_ino_t first_ino; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); + ASSERT(ip->i_d.di_nblocks == 0); + + /* + * Pull the on-disk inode from the AGI unlinked list. + */ + error = xfs_iunlink_remove(tp, ip); + if (error) + return error; + + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + if (error) + return error; + + ip->i_d.di_mode = 0; /* mark incore inode as free */ + ip->i_d.di_flags = 0; + ip->i_d.di_dmevmask = 0; + ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ + ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + /* + * Bump the generation count so no one will be confused + * by reincarnations of this inode. + */ + ip->i_d.di_gen++; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (delete) + error = xfs_ifree_cluster(ip, tp, first_ino); + + return error; +} + +/* + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. 
+ */ +static void +xfs_iunpin( + struct xfs_inode *ip) +{ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + + /* Give the log a push to start the unpinning I/O */ + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + +} + +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); + + xfs_iunpin(ip); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); +} + +void +xfs_iunpin_wait( + struct xfs_inode *ip) +{ + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); +} + +/* + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. + * + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. + * + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. 
Therefore we must drop the link counts before we remove the + * directory entry. + * + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. + */ +int +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) +{ + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + uint resblks; + + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; + + if (is_dir) + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + else + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. 
+ */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == -ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != -ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * If we're removing a directory perform some additional validation. + */ + cancel_flags |= XFS_TRANS_ABORT; + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = -ENOTEMPTY; + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = -ENOTEMPTY; + goto out_trans_cancel; + } + + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + goto out_trans_cancel; + + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != -ENOENT); + goto out_bmap_cancel; + } + + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; + + if (is_dir && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* + * Enter all inodes for a rename transaction into a sorted array. + */ +#define __XFS_SORT_INODES 5 +STATIC void +xfs_sort_for_rename( + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ +{ + int i, j; + + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; + + /* + * Sort the elements via bubble sort. (Remember, there are at + * most 5 elements to sort, so this is adequate.) 
+ */ + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + struct xfs_inode *temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } +} + +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + +/* + * xfs_cross_rename() + * + * responsible for handling RENAME_EXCHANGE flag in renameat2() system call + */ +STATIC int +xfs_cross_rename( + struct xfs_trans *tp, + struct xfs_inode *dp1, + struct xfs_name *name1, + struct xfs_inode *ip1, + struct xfs_inode *dp2, + struct xfs_name *name2, + struct xfs_inode *ip2, + struct xfs_bmap_free *free_list, + xfs_fsblock_t *first_block, + int spaceres) +{ + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, + ip2->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, + ip1->i_ino, + first_block, free_list, spaceres); + if (error) + goto out_trans_abort; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents.
+ */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp1); + if (error) + goto out_trans_abort; + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, first_block, + free_list, spaceres); + if (error) + goto out_trans_abort; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + goto out_trans_abort; + error = xfs_bumplink(tp, dp2); + if (error) + goto out_trans_abort; + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); 
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; +} + +/* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* + * xfs_rename + */ +int +xfs_rename( + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) +{ + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; + + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is
set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* the source dirent is later replaced by the whiteout: set its type */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, + inodes, &num_inodes); + + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == -ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Attach the dquots to the inodes + */ + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) + goto out_trans_cancel; + + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 5 inodes (whiteout included). + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. + */ + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented.
+ */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = -EXDEV; + goto out_trans_cancel; + } + + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); + + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + goto out_trans_cancel; + } + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == -ENOSPC) + goto out_bmap_cancel; + if (error) + goto out_trans_abort; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto out_trans_abort; + } + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = -EEXIST; + goto out_trans_cancel; + } + } + + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. 
+ * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_trans_abort; + + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_trans_abort; + + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto out_trans_abort; + } + } /* target_ip != NULL */ + + /* + * Remove the source. + */ + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != -EEXIST); + if (error) + goto out_trans_abort; + } + + /* + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. + */ + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); + + /* + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. + */ + if (src_is_directory && (new_parent || target_ip != NULL)) { + + /* + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. 
+ */ + error = xfs_droplink(tp, src_dp); + if (error) + goto out_trans_abort; + } + + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, + &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto out_trans_abort; + + /* + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. + */ + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. 
+ */ + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + if (wip) + IRELE(wip); + return error; +} + +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + goto out_put; + + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. 
+ */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Do an un-protected check to see if the inode is dirty and + * is a candidate for flushing. These checks will be repeated + * later after the appropriate locks are acquired. + */ + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + continue; + + /* + * Try to get locks. If any are unavailable or it is pinned, + * then this inode cannot be flushed and is skipped. + */ + + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); + } + xfs_iunlock(iq, XFS_ILOCK_SHARED); + } + + if (clcount) { + XFS_STATS_INC(xs_icluster_flushcnt); + XFS_STATS_ADD(xs_icluster_flushinode, clcount); + } + +out_free: + rcu_read_unlock(); + kmem_free(ilist); +out_put: + xfs_perag_put(pag); + return 0; + + +cluster_corrupt_out: + /* + * Corruption detected in the clustering loop. Invalidate the + * inode buffer and shut down the filesystem. + */ + rcu_read_unlock(); + /* + * Clean up the buffer. If it was delwri, just release it -- + * brelse can handle it with no problems. If not, shut down the + * filesystem before releasing the buffer. 
+ */ + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); + if (bufwasdelwri) + xfs_buf_relse(bp); + + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + + if (!bufwasdelwri) { + /* + * Just like incore_relse: if we have b_iodone functions, + * mark the buffer as an error and call them. Otherwise + * mark it as stale and brelse. + */ + if (bp->b_iodone) { + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); + } else { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + } + + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(iq, false); + kmem_free(ilist); + xfs_perag_put(pag); + return -EFSCORRUPTED; +} + +/* + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. + */ +int +xfs_iflush( + struct xfs_inode *ip, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int error; + + XFS_STATS_INC(xs_iflush_count); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + + *bpp = NULL; + + xfs_iunpin_wait(ip); + + /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_imap_to_bp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. 
+ */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = -EIO; + goto abort_out; + } + + /* + * Get the buffer containing the on-disk inode. + */ + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } + + /* + * First flush out the inode that xfs_iflush was called with. + */ + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; + + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) + xfs_log_force(mp, 0); + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; + + *bpp = bp; + return 0; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +cluster_corrupt_out: + error = -EFSCORRUPTED; +abort_out: + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(ip, false); + return error; +} + +STATIC int +xfs_iflush_int( + struct xfs_inode *ip, + struct xfs_buf *bp) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t 
*)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), + mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, + mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + __func__, ip->i_ino, ip, ip->i_d.di_magic); + goto corrupt_out; + } + if (S_ISREG(ip->i_d.di_mode)) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad regular inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } else if (S_ISDIR(ip->i_d.di_mode)) { + if (XFS_TEST_ERROR( + (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && + (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad directory inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); + goto corrupt_out; + } + } + if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, + XFS_RANDOM_IFLUSH_5)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: detected corrupt incore inode %Lu, " + "total extents = %d, nblocks = %Ld, ptr 0x%p", + __func__, ip->i_ino, + ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_d.di_nblocks, ip); + goto corrupt_out; + } + if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, + mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_forkoff, ip); + goto corrupt_out; + } + 
+ /* + * Inode item log recovery for v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. + */ + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; + + /* + * Copy the dirty parts of the inode into the on-disk + * inode. We always copy out the core of the inode, + * because if the inode is dirty at all the core must + * be. + */ + xfs_dinode_to_disk(dip, &ip->i_d); + + /* Wrap, we never let the log put out DI_MAX_FLUSH */ + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + xfs_inobp_check(mp, bp); + + /* + * We've recorded everything logged in the inode, so we'd like to clear + * the ili_fields bits so we don't log and flush things unnecessarily. + * However, we can't stop logging all this information until the data + * we've copied into the disk buffer is written to disk. If we did we + * might overwrite the copy of the inode in the log with all the data + * after re-logging only part of it, and in the face of a crash we + * wouldn't have all the data we need to recover. + * + * What we do is move the bits to the ili_last_fields field. When + * logging the inode, these bits are moved back to the ili_fields field. + * In the xfs_iflush_done() routine we clear ili_last_fields, since we + * know that the information those bits represent is permanently on + * disk. As long as the flush completes before the inode is logged + * again, then both ili_fields and ili_last_fields will be cleared. 
+ * + * We can play with the ili_fields bits here, because the inode lock + * must be held exclusively in order to set bits there and the flush + * lock protects the ili_last_fields bits. Set ili_logged so the flush + * done routine can tell whether or not to look in the AIL. Also, store + * the current LSN of the inode so that we can tell whether the item has + * moved in the AIL from xfs_iflush_done(). In order to read the lsn we + * need the AIL lock, because it is a 64 bit value that cannot be read + * atomically. + */ + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; + + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + /* + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. + */ + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); + + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); + + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); + + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); + return 0; + +corrupt_out: + return -EFSCORRUPTED; +} diff --git a/kernel/fs/xfs/xfs_inode.h b/kernel/fs/xfs/xfs_inode.h new file mode 100644 index 000000000..8f22d2036 --- /dev/null +++ b/kernel/fs/xfs/xfs_inode.h @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_H__ +#define __XFS_INODE_H__ + +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" + +/* + * Kernel only inode definitions + */ +struct xfs_dinode; +struct xfs_inode; +struct xfs_buf; +struct xfs_bmap_free; +struct xfs_bmbt_irec; +struct xfs_inode_log_item; +struct xfs_mount; +struct xfs_trans; +struct xfs_dquot; + +typedef struct xfs_inode { + /* Inode linking and identification information. */ + struct xfs_mount *i_mount; /* fs mount struct ptr */ + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ + + /* Inode location stuff */ + xfs_ino_t i_ino; /* inode number (agno/agino)*/ + struct xfs_imap i_imap; /* location for xfs_imap() */ + + /* Extent information. */ + xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t i_df; /* data fork */ + + /* operations vectors */ + const struct xfs_dir_ops *d_ops; /* directory ops vector */ + + /* Transaction and locking information. */ + struct xfs_inode_log_item *i_itemp; /* logging information */ + mrlock_t i_lock; /* inode lock */ + mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ + atomic_t i_pincount; /* inode pin count */ + spinlock_t i_flags_lock; /* inode i_flags lock */ + /* Miscellaneous state. 
*/ + unsigned long i_flags; /* see defined flags below */ + unsigned int i_delayed_blks; /* count of delay alloc blks */ + + xfs_icdinode_t i_d; /* most of ondisk inode */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ +} xfs_inode_t; + +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return container_of(inode, struct xfs_inode, i_vnode); +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + +/* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} + +/* + * If this I/O goes past the on-disk inode size update it unless it would + * be past the current in-core inode size. + */ +static inline xfs_fsize_t +xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) +{ + xfs_fsize_t i_size = i_size_read(VFS_I(ip)); + + if (new_size > i_size || new_size < 0) + new_size = i_size; + return new_size > ip->i_d.di_size ? 
new_size : 0; +} + +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). 
+ */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} + +/* + * In-core inode flags. + */ +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ + +/* + * Per-lifetime flags need to be reset when re-using a reclaimable inode during + * inode lookup. This prevents unintended behaviour on the new inode from + * occurring. + */ +#define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + +/* + * Synchronize processes attempting to flush the in-core inode back to disk.
+ */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + +/* + * Flags for inode locking. + * Bit ranges: 1<<0 - 1<<16-1 -- iolock/ilock modes (bitfield) + * 1<<16 - 1<<32-1 -- lockdep annotation (integers) + */ +#define XFS_IOLOCK_EXCL (1<<0) +#define XFS_IOLOCK_SHARED (1<<1) +#define XFS_ILOCK_EXCL (1<<2) +#define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) + +#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) + +#define XFS_LOCK_FLAGS \ + { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ + { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ + { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } + + +/* + * Flags for lockdep annotations. + * + * XFS_LOCK_PARENT - for directory operations that require locking a + * parent directory inode and a child entry inode. The parent gets locked + * with this flag so it gets a lockdep subclass of 1 and the child entry + * lock will have a lockdep subclass of 0. + * + * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary + * inodes do not participate in the normal lock order, and thus have their + * own subclasses. + * + * XFS_LOCK_INUMORDER - for locking several inodes at the same time + * with xfs_lock_inodes().
This flag is used as the starting subclass + * and each subsequent lock acquired will increment the subclass by one. + * So the first lock acquired will have a lockdep subclass of 4, the + * second lock will have a lockdep subclass of 5, and so on. It is + * the responsibility of the class builder to shift this to the correct + * portion of the lock_mode lockdep mask. + */ +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_RTBITMAP 2 +#define XFS_LOCK_RTSUM 3 +#define XFS_LOCK_INUMORDER 4 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) + +#define XFS_MMAPLOCK_SHIFT 20 + +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) + +/* + * For multiple groups support: if S_ISGID bit is set in the parent + * directory, group of new file is set to that of the parent, and + * new subdirectory gets S_ISGID bit from parent. 
+ */ +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) + +int xfs_release(struct xfs_inode *ip); +void xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, + umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, unsigned int flags); + +void xfs_ilock(xfs_inode_t *, uint); +int xfs_ilock_nowait(xfs_inode_t *, uint); +void xfs_iunlock(xfs_inode_t *, uint); +void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_buf **, xfs_inode_t **); + +uint xfs_ip2xflags(struct xfs_inode *); +uint xfs_dic2xflags(struct xfs_dinode *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); +int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, + int, xfs_fsize_t); +int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); + +void xfs_iext_realloc(xfs_inode_t *, int, int); + +void xfs_iunpin_wait(xfs_inode_t *); +#define xfs_ipincount(ip) ((unsigned int) atomic_read(&(ip)->i_pincount)) /* reads i_pincount; (ip) is parenthesized so any pointer expression is safe */ + +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void
xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); + +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); + +int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_inode **, int *); +int xfs_droplink(struct xfs_trans *, struct xfs_inode *); +int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); + +/* from xfs_file.c */ +enum xfs_prealloc_flags { + XFS_PREALLOC_SET = (1 << 1), + XFS_PREALLOC_CLEAR = (1 << 2), + XFS_PREALLOC_SYNC = (1 << 3), + XFS_PREALLOC_INVISIBLE = (1 << 4), +}; + +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); + + +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. 
+ */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + ihold(VFS_I(ip)); \ + trace_xfs_ihold(ip, _THIS_IP_); \ +} while (0) + +#define IRELE(ip) \ +do { \ + trace_xfs_irele(ip, _THIS_IP_); \ + iput(VFS_I(ip)); \ +} while (0) + +extern struct kmem_zone *xfs_inode_zone; + +/* + * Flags for read/write calls + */ +#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ +#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { XFS_IO_ISDIRECT, "DIRECT" }, \ + { XFS_IO_INVIS, "INVIS"} + +#endif /* __XFS_INODE_H__ */ diff --git a/kernel/fs/xfs/xfs_inode_item.c b/kernel/fs/xfs/xfs_inode_item.c new file mode 100644 index 000000000..bf13a5a7e --- /dev/null +++ b/kernel/fs/xfs/xfs_inode_item.c @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" + + +kmem_zone_t *xfs_ili_zone; /* inode log item zone */ + +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_inode_log_item, ili_item); +} + +STATIC void +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + /* worst case, doesn't subtract delalloc extents */ + *nbytes += XFS_IFORK_DSIZE(ip); + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { + *nbytes += ip->i_df.if_broot_bytes; + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { + *nbytes += roundup(ip->i_df.if_bytes, 4); + *nvecs += 1; + } + break; + + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_UUID: + break; + default: + ASSERT(0); + break; + } +} + +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + /* worst case, doesn't subtract unused 
space */ + *nbytes += XFS_IFORK_ASIZE(ip); + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { + *nbytes += ip->i_afp->if_broot_bytes; + *nvecs += 1; + } + break; + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { + *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nvecs += 1; + } + break; + default: + ASSERT(0); + break; + } +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); + + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); +} + +STATIC void +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; + + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ASSERT(data_bytes 
<= ip->i_df.if_bytes); + + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DEXT; + } + break; + case XFS_DINODE_FMT_BTREE: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { + ASSERT(ip->i_df.if_broot != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; + } else { + ASSERT(!(iip->ili_fields & + XFS_ILOG_DBROOT)); + iip->ili_fields &= ~XFS_ILOG_DBROOT; + } + break; + case XFS_DINODE_FMT_LOCAL: + iip->ili_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). + */ + data_bytes = roundup(ip->i_df.if_bytes, 4); + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DDATA; + } + break; + case XFS_DINODE_FMT_DEV: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; + break; + case XFS_DINODE_FMT_UUID: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; + break; + default: + ASSERT(0); + break; + } +} + +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec 
*lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; + + switch (ip->i_d.di_aformat) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ip->i_d.di_anextents); + ASSERT(ip->i_afp->if_u1.if_extents != NULL); + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_AEXT; + } + break; + case XFS_DINODE_FMT_BTREE: + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { + ASSERT(ip->i_afp->if_broot != NULL); + + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ABROOT; + } + break; + case XFS_DINODE_FMT_LOCAL: + iip->ili_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { + /* + * Round i_bytes up to a word boundary. + * The underlying memory is guaranteed to + * to be there by xfs_idata_realloc(). 
+ */ + data_bytes = roundup(ip->i_afp->if_bytes, 4); + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ADATA; + } + break; + default: + ASSERT(0); + break; + } +} + +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ASSERT(ip->i_d.di_version > 1); + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} + +/* + * This is called to pin the inode associated with the 
inode log + * item in memory so it cannot be written out. + */ +STATIC void +xfs_inode_item_pin( + struct xfs_log_item *lip) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); +} + + +/* + * This is called to unpin the inode associated with the inode log + * item which was previously pinned with a call to xfs_inode_item_pin(). + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. + */ +STATIC void +xfs_inode_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); +} + +STATIC uint +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ + if (xfs_ipincount(ip) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Stale inode items should force out the iclog. + */ + if (ip->i_flags & XFS_ISTALE) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. 
+ */ + if (!xfs_iflock_nowait(ip)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; + } + + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; +} + +/* + * Unlock the inode associated with the inode log item. + * Clear the fields of the inode and inode log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the inode. + */ +STATIC void +xfs_inode_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; + + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) + xfs_iunlock(ip, lock_flags); +} + +/* + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completes before the inode is inserted + * into the AIL. 
In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freeing an inode + * still in the AIL. + * + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. + */ +STATIC xfs_lsn_t +xfs_inode_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } + return lsn; +} + +/* + * XXX rcc - this one really has to do something. Probably needs + * to stamp in a new field in the incore inode. + */ +STATIC void +xfs_inode_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + INODE_ITEM(lip)->ili_last_lsn = lsn; +} + +/* + * This is the ops vector shared by all inode log items. + */ +static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_committing = xfs_inode_item_committing +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + */ +void +xfs_inode_item_init( + struct xfs_inode *ip, + struct xfs_mount *mp) +{ + struct xfs_inode_log_item *iip; + + ASSERT(ip->i_itemp == NULL); + iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); + + iip->ili_inode = ip; + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); +} + +/* + * Free the inode log item and any memory hanging off of it.
+ */ +void +xfs_inode_item_destroy( + xfs_inode_t *ip) +{ + kmem_zone_free(xfs_ili_zone, ip->i_itemp); +} + + +/* + * This is the inode flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the inode is + * flushed to disk. It is responsible for removing the inode item + * from the AIL if it has not been re-logged, and unlocking the inode's + * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. + */ +void +xfs_iflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; + struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = bp->b_fspriv; + prev = NULL; + while (blip != NULL) { + if (blip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + bp->b_fspriv = next; + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; + + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + /* + * We only want to pull the item from the AIL if it is + * actually there and its location in the log has not + * changed since we started the flush. 
Thus, we only bother + * if the ili_logged flag is set and the inode's lsn has not + * changed. First we check the lsn outside + * the lock since it's cheaper, and then we recheck while + * holding the lock before removing the inode from the AIL. + */ + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); + } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); + } + + + /* + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. + */ + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; + + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } +} + +/* + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. + */ +void +xfs_iflush_abort( + xfs_inode_t *ip, + bool stale) +{ + xfs_inode_log_item_t *iip = ip->i_itemp; + + if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? 
+ SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); + } else + spin_unlock(&ailp->xa_lock); + } + iip->ili_logged = 0; + /* + * Clear the ili_last_fields bits now that we know that the + * data corresponding to them is safely on disk. + */ + iip->ili_last_fields = 0; + /* + * Clear the inode logging fields so no more flushes are + * attempted. + */ + iip->ili_fields = 0; + } + /* + * Release the inode's flush lock since we're done with it. + */ + xfs_ifunlock(ip); +} + +void +xfs_istale_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); +} + +/* + * convert an xfs_inode_log_format struct from either 32 or 64 bit versions + * (which can have different field alignments) to the native version + */ +int +xfs_inode_item_format_convert( + xfs_log_iovec_t *buf, + xfs_inode_log_format_t *in_f) +{ + if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; + + in_f->ilf_type = in_f32->ilf_type; + in_f->ilf_size = in_f32->ilf_size; + in_f->ilf_fields = in_f32->ilf_fields; + in_f->ilf_asize = in_f32->ilf_asize; + in_f->ilf_dsize = in_f32->ilf_dsize; + in_f->ilf_ino = in_f32->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f32->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f32->ilf_blkno; + in_f->ilf_len = in_f32->ilf_len; + in_f->ilf_boffset = in_f32->ilf_boffset; + return 0; + } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; + + in_f->ilf_type = in_f64->ilf_type; + in_f->ilf_size = in_f64->ilf_size; + in_f->ilf_fields = in_f64->ilf_fields; + in_f->ilf_asize = in_f64->ilf_asize; + in_f->ilf_dsize = in_f64->ilf_dsize; + in_f->ilf_ino = in_f64->ilf_ino; + /* copy biggest field of ilf_u */ + memcpy(in_f->ilf_u.ilfu_uuid.__u_bits, + in_f64->ilf_u.ilfu_uuid.__u_bits, + sizeof(uuid_t)); + in_f->ilf_blkno = in_f64->ilf_blkno; + in_f->ilf_len = 
in_f64->ilf_len; + in_f->ilf_boffset = in_f64->ilf_boffset; + return 0; + } + return -EFSCORRUPTED; +} diff --git a/kernel/fs/xfs/xfs_inode_item.h b/kernel/fs/xfs/xfs_inode_item.h new file mode 100644 index 000000000..488d81254 --- /dev/null +++ b/kernel/fs/xfs/xfs_inode_item.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_ITEM_H__ +#define __XFS_INODE_ITEM_H__ + +/* kernel only definitions */ + +struct xfs_buf; +struct xfs_bmbt_rec; +struct xfs_inode; +struct xfs_mount; + +typedef struct xfs_inode_log_item { + xfs_log_item_t ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + unsigned short ili_lock_flags; /* lock flags */ + unsigned short ili_logged; /* flushed logged data */ + unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ +} xfs_inode_log_item_t; + +static inline int xfs_inode_clean(xfs_inode_t *ip) +{ + return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); +} + +extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); +extern void xfs_inode_item_destroy(struct xfs_inode *); +extern void xfs_iflush_done(struct xfs_buf *, struct 
xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); +extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, + xfs_inode_log_format_t *); + +extern struct kmem_zone *xfs_ili_zone; + +#endif /* __XFS_INODE_ITEM_H__ */ diff --git a/kernel/fs/xfs/xfs_ioctl.c b/kernel/fs/xfs/xfs_ioctl.c new file mode 100644 index 000000000..87f67c6b6 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl.c @@ -0,0 +1,1806 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ioctl.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_fsops.h" +#include "xfs_discard.h" +#include "xfs_quota.h" +#include "xfs_export.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_pnfs.h" + +#include +#include +#include +#include +#include +#include +#include + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct fd f = {NULL}; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + f = fdget(hreq->fd); + if (!f.file) + return -EBADF; + inode = file_inode(f.file); + } else { + error = user_lpath((const char __user *)hreq->path, &path); + if (error) + return error; + inode = d_inode(path.dentry); + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. 
+ */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. + */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; + + hsize = XFS_HSIZE(handle); + } + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fdput(f); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + struct xfs_fid64 fid; + + /* + * Only allow handle opens under a directory. 
+	 */
+	if (!S_ISDIR(file_inode(parfilp)->i_mode))
+		return ERR_PTR(-ENOTDIR);
+
+	if (hlen != sizeof(xfs_handle_t))
+		return ERR_PTR(-EINVAL);
+	if (copy_from_user(&handle, uhandle, hlen))
+		return ERR_PTR(-EFAULT);
+	if (handle.ha_fid.fid_len !=
+	    sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Clear the object by its own size.  "fid" is a struct xfs_fid64 that
+	 * is only cast to struct fid for the call below, so sizing the memset
+	 * by struct fid would not be guaranteed to cover every byte of it.
+	 */
+	memset(&fid, 0, sizeof(fid));
+	fid.ino = handle.ha_fid.fid_ino;
+	fid.gen = handle.ha_fid.fid_gen;
+
+	return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+			FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+			xfs_handle_acceptable, NULL);
+}
+
+/*
+ * Resolve the input handle carried in a handle request (hreq->ihandle,
+ * hreq->ihandlen) to a dentry.  Returns whatever xfs_handle_to_dentry()
+ * returns, i.e. a dentry or an ERR_PTR().
+ */
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+/*
+ * Open the regular file or directory named by the handle in @hreq using the
+ * open flags in hreq->oflags, and return a new file descriptor for it.
+ *
+ * The caller needs CAP_SYS_ADMIN.  Returns a negative errno on failure.
+ */
+int
+xfs_open_by_handle(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	const struct cred	*cred = current_cred();
+	int			error;
+	int			fd;
+	int			permflag;
+	struct file		*filp;
+	struct inode		*inode;
+	struct dentry		*dentry;
+	fmode_t			fmode;
+	struct path		path;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	inode = d_inode(dentry);
+
+	/* Restrict xfs_open_by_handle to directories & regular files. */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+		error = -EPERM;
+		goto out_dput;
+	}
+
+#if BITS_PER_LONG != 32
+	hreq->oflags |= O_LARGEFILE;
+#endif
+
+	permflag = hreq->oflags;
+	fmode = OPEN_FMODE(permflag);
+	/* Append-only inodes may only be opened for write with O_APPEND and without O_TRUNC. */
+	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+	    (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
+		error = -EPERM;
+		goto out_dput;
+	}
+
+	if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+		error = -EACCES;
+		goto out_dput;
+	}
+
+	/* Can't write directories.
*/ + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { + error = -EISDIR; + goto out_dput; + } + + fd = get_unused_fd_flags(0); + if (fd < 0) { + error = fd; + goto out_dput; + } + + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (S_ISREG(inode->i_mode)) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + void *link; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!d_is_symlink(dentry)) { + error = -EINVAL; + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -EFAULT; + goto out_dput; + } + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) { + error = -ENOMEM; + goto out_dput; + } + + error = xfs_readlink(XFS_I(d_inode(dentry)), link); + if (error) + goto out_kfree; + error = readlink_copy(hreq->ohandle, olen, link); + if (error) + goto out_kfree; + + out_kfree: + kfree(link); + out_dput: + dput(dentry); + return error; +} + +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + 
ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + +STATIC int +xfs_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -EPERM; + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -EFAULT; + + error = mnt_want_write_file(parfilp); + if (error) + return error; + + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); + return PTR_ERR(dentry); + } + + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { + error = -EPERM; + goto out; + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + error = -EFAULT; + goto out; + } + + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + + out: + mnt_drop_write_file(parfilp); + dput(dentry); + return error; +} + +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error = -ENOMEM; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -EFAULT; + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. 
+	 */
+	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+		return -EINVAL;
+
+	dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	if (!kbuf)
+		goto out_dput;
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen,
+					al_hreq.flags, cursor);
+	if (error)
+		goto out_kfree;
+
+	if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
+		error = -EFAULT;
+
+out_kfree:
+	kmem_free(kbuf);
+out_dput:
+	dput(dentry);
+	return error;
+}
+
+/*
+ * Read one extended attribute of @inode into a temporary kernel buffer and
+ * copy it out to @ubuf.
+ *
+ * On entry *len is the size of the user buffer.  xfs_attr_get() is handed a
+ * pointer to it, and the value it leaves there is the number of bytes copied
+ * to user space.
+ *
+ * Returns 0 on success, -EINVAL if *len exceeds XATTR_SIZE_MAX, -ENOMEM if
+ * the temporary buffer cannot be allocated, -EFAULT if the copy-out fails,
+ * or the error returned by xfs_attr_get().
+ */
+int
+xfs_attrmulti_attr_get(
+	struct inode		*inode,
+	unsigned char		*name,
+	unsigned char __user	*ubuf,
+	__uint32_t		*len,
+	__uint32_t		flags)
+{
+	unsigned char		*kbuf;
+	int			error = -EFAULT;
+
+	if (*len > XATTR_SIZE_MAX)
+		return -EINVAL;
+	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+	if (!kbuf)
+		return -ENOMEM;
+
+	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+	if (error)
+		goto out_kfree;
+
+	if (copy_to_user(ubuf, kbuf, *len))
+		error = -EFAULT;
+
+out_kfree:
+	kmem_free(kbuf);
+	return error;
+}
+
+/*
+ * Set one extended attribute of @inode from a value held in user memory.
+ *
+ * The value is duplicated into kernel memory with memdup_user() and passed
+ * to xfs_attr_set().  The duplicate belongs to this function and is freed
+ * before returning on both the success and the failure path.
+ *
+ * Returns -EPERM for immutable or append-only inodes, -EINVAL if @len
+ * exceeds XATTR_SIZE_MAX, the memdup_user() error if the copy-in fails, or
+ * the result of xfs_attr_set().
+ */
+int
+xfs_attrmulti_attr_set(
+	struct inode		*inode,
+	unsigned char		*name,
+	const unsigned char __user *ubuf,
+	__uint32_t		len,
+	__uint32_t		flags)
+{
+	unsigned char		*kbuf;
+	int			error;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (len > XATTR_SIZE_MAX)
+		return -EINVAL;
+
+	kbuf = memdup_user(ubuf, len);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+
+	error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+	/* memdup_user() memory must be released by its caller with kfree(). */
+	kfree(kbuf);
+	return error;
+}
+
+/*
+ * Remove one extended attribute of @inode.
+ *
+ * Returns -EPERM for immutable or append-only inodes, otherwise the result
+ * of xfs_attr_remove().
+ */
+int
+xfs_attrmulti_attr_remove(
+	struct inode		*inode,
+	unsigned char		*name,
+	__uint32_t		flags)
+{
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	return xfs_attr_remove(XFS_I(inode), name, flags);
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+	struct file		*parfilp,
+	void			__user *arg)
+{
+	int			error;
+	xfs_attr_multiop_t	*ops;
+	xfs_fsop_attrmulti_handlereq_t am_hreq;
+	struct dentry		*dentry;
+
unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = -ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + d_inode(dentry), attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + d_inode(dentry), attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + d_inode(dentry), attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = -EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = -EFAULT; + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +int +xfs_ioc_space( + struct xfs_inode *ip, 
+ struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf) +{ + struct iattr iattr; + enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; + int error; + + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) + return -EPERM; + + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (filp->f_flags & O_DSYNC) + flags |= XFS_PREALLOC_SYNC; + if (ioflags & XFS_IO_INVIS) + flags |= XFS_PREALLOC_INVISIBLE; + + error = mnt_want_write_file(filp); + if (error) + return error; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock, false); + if (error) + goto out_unlock; + + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += filp->f_pos; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + error = -EINVAL; + goto out_unlock; + } + + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. 
+ */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) { + error = -EINVAL; + goto out_unlock; + } + break; + default: + bf->l_len = 0; + break; + } + + if (bf->l_start < 0 || + bf->l_start > inode->i_sb->s_maxbytes || + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) { + error = -EINVAL; + goto out_unlock; + } + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + flags |= XFS_PREALLOC_SET; + error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + flags |= XFS_PREALLOC_SET; + error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, + XFS_BMAPI_PREALLOC); + break; + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + error = xfs_free_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + flags |= XFS_PREALLOC_CLEAR; + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + + error = xfs_setattr_size(ip, &iattr); + break; + default: + ASSERT(0); + error = -EINVAL; + } + + if (error) + goto out_unlock; + + error = xfs_update_prealloc_flags(ip, flags); + +out_unlock: + xfs_iunlock(ip, iolock); + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if 
(copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -EFAULT; + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -EFAULT; + + if ((count = bulkreq.icount) <= 0) + return -EINVAL; + + if (bulkreq.ubuffer == NULL) + return -EINVAL; + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer, + sizeof(xfs_bstat_t), NULL, &done); + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); + + if (error) + return error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -EFAULT; + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return error; + + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -EFAULT; + return 0; +} + +/* + * Linux extended inode flags interface. 
+ */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & FS_IMMUTABLE_FL) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & FS_SYNC_FL) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & FS_NOATIME_FL) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & FS_NODUMP_FL) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= FS_SYNC_FL; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= FS_NOATIME_FL; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= FS_NODUMP_FL; + return flags; +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = xfs_get_projid(ip); + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + 
unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +static int +xfs_ioctl_setattr_xflags( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Can't change realtime flag if any extents are allocated. 
*/ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + return -EINVAL; + + /* If realtime flag is set then must have realtime device */ + if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) + return -EINVAL; + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || + (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + XFS_STATS_INC(xs_ig_attrchg); + return 0; +} + +/* + * Set up the transaction structure for the setattr operation, checking that we + * have permission to do so. On success, return a clean transaction and the + * inode locked exclusively ready for further operation specific checks. On + * failure, return an error without modifying or locking the inode. + */ +static struct xfs_trans * +xfs_ioctl_setattr_get_trans( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return ERR_PTR(-EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return ERR_PTR(-EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal to the file owner + * ID, except in cases where the CAP_FSETID capability is applicable. 
+ */ + if (!inode_owner_or_capable(VFS_I(ip))) { + error = -EPERM; + goto out_cancel; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + return tp; + +out_cancel: + xfs_trans_cancel(tp, 0); + return ERR_PTR(error); +} + +/* + * extent size hint validation is somewhat cumbersome. Rules are: + * + * 1. extent size hint is only valid for directories and regular files + * 2. XFS_XFLAG_EXTSIZE is only valid for regular files + * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. can only be changed on regular files if no extents are allocated + * 5. can be changed on directories at any time + * 6. extsize hint of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * 8. for non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. + */ +static int +xfs_ioctl_setattr_check_extsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(ip->i_d.di_mode)) + return -EINVAL; + + if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) + return -EINVAL; + + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) + return -EINVAL; + + if (XFS_IS_REALTIME_INODE(ip) || + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + } + + if (fa->fsx_extsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + + return 0; +} + +static int 
+xfs_ioctl_setattr_check_projid( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + /* Disallow 32bit project ids if projid32bit feature is not enabled. */ + if (fa->fsx_projid > (__uint16_t)-1 && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return -EINVAL; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (xfs_get_projid(ip) != fa->fsx_projid) + return -EINVAL; + if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) + return -EINVAL; + + return 0; +} + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + trace_xfs_ioctl_setattr(ip); + + code = xfs_ioctl_setattr_check_projid(ip, fa); + if (code) + return code; + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. 
+ */ + if (XFS_IS_QUOTA_ON(mp)) { + code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + if (code) + return code; + } + + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + code = PTR_ERR(tp); + goto error_free_dquots; + } + + + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_trans_cancel; + } + + code = xfs_ioctl_setattr_check_extsize(ip, fa); + if (code) + goto error_trans_cancel; + + code = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (code) + goto error_trans_cancel; + + /* + * Change file ownership. Must be the owner or privileged. CAP_FSETID + * overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be cleared upon + * successful return from chown() + */ + + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* Change the ownerships and register project quota modifications */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_pdquot, pdqp); + } + ASSERT(ip->i_d.di_version > 1); + xfs_set_projid(ip, fa->fsx_projid); + } + + /* + * Only set the extent size hint if we've already determined that the + * extent size hint should be set on the inode. If no extent size flags + * are set on the inode then unconditionally clear the extent size hint. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + else + ip->i_d.di_extsize = 0; + + code = xfs_trans_commit(tp, 0); + + /* + * Release any dquot(s) the inode had kept before chown. 
+ */ + xfs_qm_dqrele(olddquot); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + + return code; + +error_trans_cancel: + xfs_trans_cancel(tp, 0); +error_free_dquots: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + return code; +} + +STATIC int +xfs_ioc_fssetxattr( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + int error; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa); + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + struct xfs_inode *ip, + struct file *filp, + void __user *arg) +{ + struct xfs_trans *tp; + struct fsxattr fa; + unsigned int flags; + int error; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + error = mnt_want_write_file(filp); + if (error) + return error; + + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out_drop_write; + } + + error = xfs_ioctl_setattr_xflags(tp, ip, &fa); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_write; + } + + error = xfs_trans_commit(tp, 0); +out_drop_write: + mnt_drop_write_file(filp); + return error; +} + +STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = (struct getbmap __user *)*ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return -EFAULT; + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int +xfs_ioc_getbmap( 
+ struct xfs_inode *ip, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + return -EFAULT; + + if (bmx.bmv_count < 2) + return -EINVAL; + + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & XFS_IO_INVIS) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (__force struct getbmap *)arg+1); + if (error) + return error; + + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = (struct getbmapx __user *)*ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return -EFAULT; + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + struct xfs_inode *ip, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -EFAULT; + + if (bmx.bmv_count < 2) + return -EINVAL; + + if (bmx.bmv_iflags & (~BMV_IF_VALID)) + return -EINVAL; + + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (__force struct getbmapx *)arg+1); + if (error) + return error; + + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) + return -EFAULT; + + return 0; +} + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct fd f, tmp; + int error = 0; + + /* Pull information for the target fd */ + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { + error = -EINVAL; + goto out; + } + + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { + error = -EBADF; + goto out_put_file; + } + + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { + error = -EINVAL; + goto out_put_file; + } + + if (!(tmp.file->f_mode & FMODE_WRITE) || + 
!(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { + error = -EBADF; + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { + error = -EINVAL; + goto out_put_tmp_file; + } + + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); + + if (ip->i_mount != tip->i_mount) { + error = -EINVAL; + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = -EINVAL; + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fdput(tmp); + out_put_file: + fdput(f); + out: + return error; +} + +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. + * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + trace_xfs_file_ioctl(ip); + + switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { + xfs_flock64_t bf; + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -EFAULT; + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? 
+ mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -EFAULT; + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -EFAULT; + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -EFAULT; + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(filp, arg); + + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, 
arg, sizeof(xfs_fsop_handlereq_t))) + return -EFAULT; + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(filp, arg); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(filp, arg); + + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -EFAULT; + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); + if (error) + return error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -EFAULT; + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -EFAULT; + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return 
-EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return error; + } + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -EFAULT; + + return xfs_fs_goingdown(mp, in); + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -EFAULT; + + return xfs_errortag_add(in.errtag, mp); + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_errortag_clearall(mp, 1); + + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_fs_eofblocks eofb; + struct xfs_eofblocks keofb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -EFAULT; + + error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + if (error) + return error; + + return xfs_icache_free_eofblocks(mp, &keofb); + } + + default: + return -ENOTTY; + } +} diff --git a/kernel/fs/xfs/xfs_ioctl.h b/kernel/fs/xfs/xfs_ioctl.h new file mode 100644 index 000000000..77c02c790 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags); + +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +extern int +xfs_set_dmattrs( + struct xfs_inode *ip, + u_int evmask, + u_int16_t state); + +#endif diff --git a/kernel/fs/xfs/xfs_ioctl32.c b/kernel/fs/xfs/xfs_ioctl32.c new file mode 100644 index 000000000..b88bdc85d --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl32.c @@ -0,0 
+1,680 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include +#include +#include +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_ioctl32.h" +#include "xfs_trace.h" + +#define _NATIVE_IOC(cmd, type) \ + _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) + +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) +{ + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return error; + 
/* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -EFAULT; + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -EFAULT; + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const struct xfs_inogrp *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -EFAULT; + } + *written = count * sizeof(*p32); + return 0; +} + +#else +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#endif /* BROKEN_X86_ALIGNMENT */ + +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 
32 */ + + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -EFAULT; + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -EFAULT; + return 0; +} + +/* XFS_IOC_FSBULKSTAT and friends */ + +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -EFAULT; + return 0; +} + +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( + void __user 
*ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return -ENOMEM; + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_forkoff, &p32->bs_forkoff) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return -EFAULT; + if (ubused) + *ubused = sizeof(*p32); + return 0; +} + +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... 
*/ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, + ubused, stat); +} + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) +{ + u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (get_user(addr, &p32->lastip)) + return -EFAULT; + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -EFAULT; + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -EFAULT; + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -EFAULT; + + if ((count = bulkreq.icount) <= 0) + return -EINVAL; + + if (bulkreq.ubuffer == NULL) + return -EINVAL; + + if (cmd == XFS_IOC_FSINUMBERS_32) { + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), NULL, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); + } else + error = -EINVAL; + if (error) + return error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -EFAULT; + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -EFAULT; + } + + return 0; +} + +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + 
compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -EFAULT; + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + + return 0; +} + +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); +} + +STATIC int +xfs_compat_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -EFAULT; + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -EINVAL; + + /* + * Reject flags, only allow namespaces. 
+ */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -EINVAL; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -ENOMEM; + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); +out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -EFAULT; + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); + goto out_dput; + } + + error = -ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + 
ops[i].am_error = xfs_attrmulti_attr_get( + d_inode(dentry), attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + d_inode(dentry), attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + d_inode(dentry), attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = -EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = -EFAULT; + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -EPERM; + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -EFAULT; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { + error = -EPERM; + goto out; + } + + if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -EFAULT; + goto out; + } + + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + dput(dentry); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + 
if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= XFS_IO_INVIS; + + trace_xfs_file_compat_ioctl(ip); + + switch (cmd) { + /* No size or alignment issues on any arch */ + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { + struct xfs_flock64 bf; + + if (xfs_compat_flock64_copyin(&bf, arg)) + return -EFAULT; + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + + if (xfs_compat_growfs_rt_copyin(&in, arg)) + 
return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return error; + } +#endif + /* long changes size, but xfs only copies out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT_32: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -EFAULT; + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return error; + } + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + return xfs_compat_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -EFAULT; + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(filp, arg); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(filp, arg); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(filp, arg); + default: + return -ENOIOCTLCMD; + } +} diff --git
a/kernel/fs/xfs/xfs_ioctl32.h b/kernel/fs/xfs/xfs_ioctl32.h new file mode 100644 index 000000000..b1bb45444 --- /dev/null +++ b/kernel/fs/xfs/xfs_ioctl32.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL32_H__ +#define __XFS_IOCTL32_H__ + +#include + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. 
+ */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. 
*/ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of 
buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, 
struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 
newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ + +#endif /* __XFS_IOCTL32_H__ */ diff --git a/kernel/fs/xfs/xfs_iomap.c b/kernel/fs/xfs/xfs_iomap.c new file mode 100644 index 000000000..38e633bad --- /dev/null +++ b/kernel/fs/xfs/xfs_iomap.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + + +#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ + << mp->m_writeio_log) +#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP + +STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_extlen_t align = 0; + int eof, error; + + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the stripe unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) + align = 0; + } + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint).
+ */ + if (extsize) { + if (align) + align = roundup_64(align, extsize); + else + align = extsize; + } + + if (align) { + xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + +STATIC int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int +xfs_iomap_write_direct( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nmaps) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t count_fsb, resaligned; + xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > XFS_ISIZE(ip)) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } else { + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + imap->br_blockcount + + imap->br_startoff); + } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = 
do_mod(resaligned, extsz))) + resaligned += extsz - temp; + } + + if (unlikely(rt)) { + resrtextents = qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space, note: need lock to return + */ + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); + if (error) + goto out_trans_cancel; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); + if (error) + goto out_bmap_cancel; + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_unlock; + + /* + * Copy any maps to caller's array and return any error. 
+ */ + if (nimaps == 0) { + error = -ENOSPC; + goto out_unlock; + } + + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + error = xfs_alert_fsblock_zero(ip, imap); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + goto out_unlock; +} + +/* + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). + * + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it is not needed. + */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + int n, error, imaps; + int found_delalloc = 0; + + *prealloc = 0; + if (offset + count <= XFS_ISIZE(ip)) + return 0; + + /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation.
+ */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + while (count_fsb > 0) { + imaps = nimaps; + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; + } + } + if (!found_delalloc) + *prealloc = 1; + return 0; +} + +/* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents.
+ */ +STATIC xfs_fsblock_t +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount << 1; + return XFS_B_TO_FSB(mp, offset); +} + +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift, + int64_t *qfreesp) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* no dq, or over 
hi wmark, squash the prealloc completely */ + if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + *qfreesp = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + if (freesp < *qfreesp) + *qfreesp = freesp; + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum preallocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) +{ + xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; + + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; + + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN.
+ */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + freesp = percpu_counter_read_positive(&mp->m_fdblocks); + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + + /* + * Check each quota to cap the prealloc size, provide a shift value to + * throttle with and adjust amount of available space. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift, + &freesp); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift, + &freesp); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift, + &freesp); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. 
+ */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + + return alloc_blocks; +} + +int +xfs_iomap_write_delay( + xfs_inode_t *ip, + xfs_off_t offset, + size_t count, + xfs_bmbt_irec_t *ret_imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t last_fsb; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; + xfs_extlen_t extsz; + int nimaps; + xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; + int prealloc; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + /* + * Make sure that the dquots are there. This doesn't hold + * the ilock across a disk read. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + return error; + + extsz = xfs_get_extsz_hint(ip); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, + imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; + +retry: + if (prealloc) { + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); + + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); + ioalign = XFS_B_TO_FSBT(mp, aligned_offset); + last_fsb = ioalign + alloc_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + } + + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + if (error) + return error; + } + + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. 
+ */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + + ASSERT(last_fsb > offset_fsb); + + nimaps = XFS_WRITE_IMAPS; + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); + switch (error) { + case 0: + case -ENOSPC: + case -EDQUOT: + break; + default: + return error; + } + + /* + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry + * without EOF preallocation. + */ + if (nimaps == 0) { + trace_xfs_delalloc_enospc(ip, offset, count); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; + } + return error ? error : -ENOSPC; + } + + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap[0]); + + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + + *ret_imap = imap[0]; + return 0; +} + +/* + * Pass in a delayed allocate extent, convert it to real extents; + * return to the caller the extent we create which maps on top of + * the originating callers request. + * + * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. + */ +int +xfs_iomap_write_allocate( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, last_block; + xfs_fileoff_t end_fsb, map_start_fsb; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + xfs_filblks_t count_fsb; + xfs_trans_t *tp; + int nimaps, committed; + int error = 0; + int nres; + + /* + * Make sure that the dquots are there. 
+ */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; + + XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + + while (count_fsb != 0) { + /* + * Set up a transaction with which to allocate the + * backing store for the file. Do allocations in a + * loop until we get some space in the range we are + * interested in. The other space that might be allocated + * is in the delayed allocation extent on which we sit + * but before our buffer starts. + */ + + nimaps = 0; + while (nimaps == 0) { + tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; + nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + nres, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + + /* + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punches here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit a delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly.
+ * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before blocking on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... + */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + error = xfs_bmap_last_offset(ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; + + last_block = XFS_FILEOFF_MAX(last_block, end_fsb); + if ((map_start_fsb + count_fsb) > last_block) { + count_fsb = last_block - map_start_fsb; + if (count_fsb == 0) { + error = -EAGAIN; + goto trans_cancel; + } + } + + /* + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, + &first_block, 1, + imap, &nimaps, &free_list); + if (error) + goto trans_cancel; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto trans_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + /* + * See if we were able to allocate an extent that + * covers at least part of the callers request + */ + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { + XFS_STATS_INC(xs_xstrat_quick); + return 0; + } + + /* + * So far we have not mapped the requested part of the + * file, just surrounding data, try again.
+ */ + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; + } + +trans_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_iomap_write_unwritten( + xfs_inode_t *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_fileoff_t offset_fsb; + xfs_filblks_t count_fsb; + xfs_filblks_t numblks_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; + xfs_fsize_t i_size; + uint resblks; + int committed; + int error; + + trace_xfs_unwritten_convert(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + + do { + /* + * set up a transaction to convert the range of extents + * from unwritten to real. Do allocations in a loop until + * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock.
+ */ + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Modify the unwritten extent state of the buffer. + */ + xfs_bmap_init(&free_list, &firstfsb); + nimaps = 1; + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, + 1, &imap, &nimaps, &free_list); + if (error) + goto error_on_bmapi_transaction; + + /* + * Log the updated inode size as we go. We have to be careful + * to only log it up to the actual write offset if it is + * halfway into a block. + */ + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; + + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_on_bmapi_transaction; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap); + + if ((numblks_fsb = imap.br_blockcount) == 0) { + /* + * The numblks_fsb value should always get + * smaller, otherwise the loop is stuck. 
+ */ + ASSERT(imap.br_blockcount); + break; + } + offset_fsb += numblks_fsb; + count_fsb -= numblks_fsb; + } while (count_fsb > 0); + + return 0; + +error_on_bmapi_transaction: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} diff --git a/kernel/fs/xfs/xfs_iomap.h b/kernel/fs/xfs/xfs_iomap.h new file mode 100644 index 000000000..8688e663d --- /dev/null +++ b/kernel/fs/xfs/xfs_iomap.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2003-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOMAP_H__ +#define __XFS_IOMAP_H__ + +struct xfs_inode; +struct xfs_bmbt_irec; + +int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); + +#endif /* __XFS_IOMAP_H__*/ diff --git a/kernel/fs/xfs/xfs_iops.c b/kernel/fs/xfs/xfs_iops.c new file mode 100644 index 000000000..f4cd7204e --- /dev/null +++ b/kernel/fs/xfs/xfs_iops.c @@ -0,0 +1,1305 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_acl.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_trans_space.h" +#include "xfs_pnfs.h" + +#include <linux/capability.h> +#include <linux/xattr.h> +#include <linux/namei.h> +#include <linux/posix_acl.h> +#include <linux/security.h> +#include <linux/fiemap.h> +#include <linux/slab.h> + +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. 
+ */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + +static int +xfs_initxattrs( + struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ + +STATIC int +xfs_init_security( + struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); +} + +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry, + int mode) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; +} + +STATIC void +xfs_cleanup_inode( + struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + struct xfs_name teardown; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * xfs_init_security we must back out. + * ENOSPC can hit here, among other things. 
+ */ + xfs_dentry_to_name(&teardown, dentry, 0); + + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); +} + +STATIC int +xfs_generic_create( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev, + bool tmpfile) /* unnamed file */ +{ + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl, *acl; + struct xfs_name name; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. + */ + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; + + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } + if (unlikely(error)) + goto out_free_acl; + + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + +#ifdef CONFIG_XFS_POSIX_ACL + if (default_acl) { + error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif + + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + + xfs_finish_inode_setup(ip); + + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return error; + + out_cleanup_inode: + xfs_finish_inode_setup(ip); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + goto out_free_acl; +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int +xfs_vn_create( 
+ struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) +{ + return xfs_vn_mknod(dir, dentry, mode, 0); +} + +STATIC int +xfs_vn_mkdir( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +xfs_vn_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *cip; + struct xfs_name name; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&name, dentry, 0); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); + if (unlikely(error)) { + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry, 0); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != -ENOENT)) + return ERR_PTR(error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... 
*/ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; +} + +STATIC int +xfs_vn_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, inode->i_mode); + + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); + if (unlikely(error)) + return error; + + ihold(inode); + d_instantiate(dentry, inode); + return 0; +} + +STATIC int +xfs_vn_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, 0); + + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; +} + +STATIC int +xfs_vn_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode; + + mode = S_IFLNK | + (irix_symlink_mode ? 
0777 & ~current_umask() : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry, mode); + + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + if (unlikely(error)) + goto out; + + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); + return 0; + + out_cleanup_inode: + xfs_finish_inode_setup(cip); + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + out: + return error; +} + +STATIC int +xfs_vn_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry, + unsigned int flags) +{ + struct inode *new_inode = d_inode(ndentry); + int omode = 0; + struct xfs_name oname; + struct xfs_name nname; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + /* if we are exchanging files, we need to set i_mode of both files */ + if (flags & RENAME_EXCHANGE) + omode = d_inode(ndentry)->i_mode; + + xfs_dentry_to_name(&oname, odentry, omode); + xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); + + return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), + XFS_I(ndir), &nname, + new_inode ? XFS_I(new_inode) : NULL, flags); +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... 
+ */ +STATIC void * +xfs_vn_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + char *link; + int error = -ENOMEM; + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; + + error = xfs_readlink(XFS_I(d_inode(dentry)), link); + if (unlikely(error)) + goto out_kfree; + + nd_set_link(nd, link); + return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +STATIC int +xfs_vn_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + trace_xfs_getattr(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. 
+ */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } + + return 0; +} + +static void +xfs_setattr_mode( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + +void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + +int +xfs_setattr_nonsize( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int error; + kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + + trace_xfs_setattr(ip); + + /* If acls are being inherited, we already have this checked */ + if (!(flags & XFS_ATTR_NOACL)) { + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + } + + ASSERT((mask & ATTR_SIZE) == 
0); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = inode->i_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = inode->i_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), + xfs_kgid_to_gid(gid), + xfs_get_projid(ip), + qflags, &udqp, &gdqp, NULL); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_dqrele; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. + * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = inode->i_uid; + igid = inode->i_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. 
+ */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || + (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { + ASSERT(tp); + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + NULL, capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (error) /* out of quota */ + goto out_trans_cancel; + } + } + + xfs_trans_ijoin(tp, ip, 0); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(iuid, uid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = xfs_kuid_to_uid(uid); + inode->i_uid = uid; + } + if (!gid_eq(igid, gid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = xfs_kgid_to_gid(gid); + inode->i_gid = gid; + } + } + + if (mask & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * Release any dquot(s) the inode had kept before chown. 
+ */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (error) + return error; + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + error = posix_acl_chmod(inode, inode->i_mode); + if (error) + return error; + } + + return 0; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + return error; +} + +/* + * Truncate file. Must have write permission and not be a directory. + */ +int +xfs_setattr_size( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t oldsize, newsize; + struct xfs_trans *tp; + int error; + uint lock_flags = 0; + uint commit_flags = 0; + bool did_zeroing = false; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -EROFS; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + + oldsize = inode->i_size; + newsize = iattr->ia_size; + + /* + * Short circuit the truncate case for zero length files. + */ + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; + + /* + * Use the regular setattr path to update the timestamps. 
+ */ + iattr->ia_valid &= ~ATTR_SIZE; + return xfs_setattr_nonsize(ip, iattr, 0); + } + + /* + * Make sure that the dquots are attached to the inode. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. + */ + if (newsize > oldsize) { + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); + if (error) + return error; + } + + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. + */ + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); + if (error) + return error; + } + + /* Now wait for all direct I/O to complete. */ + inode_dio_wait(inode); + + /* + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. 
+ * + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. + */ + error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (error) + return error; + truncate_setsize(inode, newsize); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto out_trans_cancel; + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Only change the c/mtime if we are changing the size or we are + * explicitly asked to change it. This handles the semantic difference + * between truncate() and ftruncate() as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; + } + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. 
+ * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); + if (error) + goto out_trans_abort; + + /* + * Truncated "down", so we're removing references to old data + * here - if we delay flushing for a long time, we expose + * ourselves unduly to the notorious NULL files problem. So, + * we mark this inode and flush it when the file is closed, + * and do not wait the usual (long) time for writeout. + */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); + } + + if (iattr->ia_valid & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +out_unlock: + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return error; + +out_trans_abort: + commit_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, commit_flags); + goto out_unlock; +} + +STATIC int +xfs_vn_setattr( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + uint iolock = XFS_IOLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(d_inode(dentry), &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + + error = xfs_setattr_size(ip, iattr); + } + xfs_iunlock(ip, 
iolock); + } else { + error = xfs_setattr_nonsize(ip, iattr, 0); + } + + return error; +} + +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return xfs_trans_commit(tp, 0); +} + +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. 
+ */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBBT(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; + + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = !fieinfo->fi_extents_max ? 
MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return error; + + return 0; +} + +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + +static const struct inode_operations xfs_inode_operations = { + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, +}; + +static const struct inode_operations xfs_dir_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtle differences deeper in the code, + * but we use S_ISDIR to check for those. 
+ */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename2 = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtle differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename2 = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = xfs_vn_follow_link, + .put_link = kfree_put_link, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + 
else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode and set up the operation vectors. + * + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. + */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW; + + inode_sb_list_add(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); + + inode->i_mode = ip->i_d.di_mode; + set_nlink(inode, ip->i_d.di_nlink); + inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); + inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + 
lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + ip->d_ops = ip->i_mount->m_dir_inode_ops; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); + } +} diff --git a/kernel/fs/xfs/xfs_iops.h b/kernel/fs/xfs/xfs_iops.h new file mode 100644 index 000000000..a0f84abb0 --- /dev/null +++ b/kernel/fs/xfs/xfs_iops.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +struct xfs_inode; + +extern const struct file_operations xfs_file_operations; +extern const struct file_operations xfs_dir_file_operations; + +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); + +/* + * Internal setattr interfaces. + */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); +extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); + +#endif /* __XFS_IOPS_H__ */ diff --git a/kernel/fs/xfs/xfs_itable.c b/kernel/fs/xfs/xfs_itable.c new file mode 100644 index 000000000..80429891d --- /dev/null +++ b/kernel/fs/xfs/xfs_itable.c @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +STATIC int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino))); +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ + + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return -EINVAL; + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return -ENOMEM; + + error = xfs_iget(mp, NULL, ino, + (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), + XFS_ILOCK_SHARED, &ip); + if (error) + goto out_free; + + ASSERT(ip != NULL); + ASSERT(ip->i_imap.im_blkno != 0); + + dic = &ip->i_d; + + /* xfs_iget returns the following without needing + * further change. 
+ */ + buf->bs_nlink = dic->di_nlink; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; + buf->bs_ino = ino; + buf->bs_mode = dic->di_mode; + buf->bs_uid = dic->di_uid; + buf->bs_gid = dic->di_gid; + buf->bs_size = dic->di_size; + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; + buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; + buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; + buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); + buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extents = dic->di_nextents; + buf->bs_gen = dic->di_gen; + memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); + buf->bs_dmevmask = dic->di_dmevmask; + buf->bs_dmstate = dic->di_dmstate; + buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); + + switch (dic->di_format) { + case XFS_DINODE_FMT_DEV: + buf->bs_rdev = ip->i_df.if_u2.if_rdev; + buf->bs_blksize = BLKDEV_IOSIZE; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = 0; + break; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + buf->bs_rdev = 0; + buf->bs_blksize = mp->m_sb.sb_blocksize; + buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; + break; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); + + error = formatter(buffer, ubsize, ubused, buf); + if (!error) + *stat = BULKSTAT_RV_DIDONE; + + out_free: + kmem_free(buf); + return error; +} + +/* Return 0 on success or negative error */ +STATIC int +xfs_bulkstat_one_fmt( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + if (ubsize < sizeof(*buffer)) + return -ENOMEM; + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return -EFAULT; + if (ubused) + *ubused = sizeof(*buffer); + return 0; +} + +int +xfs_bulkstat_one( + 
xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, ubused, stat); +} + +/* + * Loop over all clusters in a chunk for a given incore inode allocation btree + * record. Do a readahead if there are any allocated inodes in that cluster. + */ +STATIC void +xfs_bulkstat_ichunk_ra( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec) +{ + xfs_agblock_t agbno; + struct blk_plug plug; + int blks_per_cluster; + int inodes_per_cluster; + int i; /* inode chunk index */ + + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + + blk_start_plug(&plug); + for (i = 0; i < XFS_INODES_PER_CHUNK; + i += inodes_per_cluster, agbno += blks_per_cluster) { + if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster, + &xfs_inode_buf_ops); + } + } + blk_finish_plug(&plug); +} + +/* + * Lookup the inode chunk that the given inode lives in and then get the record + * if we found the chunk. If the inode was not the last in the chunk and there + * are some left allocated, update the data for the pointed-to record as well as + * return the count of grabbed inodes. 
+ */ +STATIC int +xfs_bulkstat_grab_ichunk( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t agino, /* starting inode of chunk */ + int *icount,/* return # of inodes grabbed */ + struct xfs_inobt_rec_incore *irec) /* btree record */ +{ + int idx; /* index into inode chunk */ + int stat; + int error = 0; + + /* Lookup the inode chunk that this inode lives in */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) { + *icount = 0; + return error; + } + + /* Get the record, should always work */ + error = xfs_inobt_get_rec(cur, irec, &stat); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); + + /* Check if the record contains the inode in request */ + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { + *icount = 0; + return 0; + } + + idx = agino - irec->ir_startino + 1; + if (idx < XFS_INODES_PER_CHUNK && + (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) { + int i; + + /* We got a right chunk with some left inodes allocated at it. + * Grab the chunk record. Mark all the uninteresting inodes + * free -- because they're before our start point. + */ + for (i = 0; i < idx; i++) { + if (XFS_INOBT_MASK(i) & ~irec->ir_free) + irec->ir_freecount++; + } + + irec->ir_free |= xfs_inobt_maskn(0, idx); + *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + } + + return 0; +} + +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + +struct xfs_bulkstat_agichunk { + char __user **ac_ubuffer;/* pointer into user's buffer */ + int ac_ubleft; /* bytes left in user's buffer */ + int ac_ubelem; /* spaces used in user's buffer */ +}; + +/* + * Process inodes in chunk with a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. 
+ */ +static int +xfs_bulkstat_ag_ichunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irbp, + bulkstat_one_pf formatter, + size_t statstruct_size, + struct xfs_bulkstat_agichunk *acp, + xfs_agino_t *last_agino) +{ + char __user **ubufp = acp->ac_ubuffer; + int chunkidx; + int error = 0; + xfs_agino_t agino = irbp->ir_startino; + + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; + chunkidx++, agino++) { + int fmterror; + int ubused; + + /* inode won't fit in buffer, we are done */ + if (acp->ac_ubleft < statstruct_size) + break; + + /* Skip if this inode is free */ + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) + continue; + + /* Get the inode and fill in a single buffer */ + ubused = statstruct_size; + error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), + *ubufp, acp->ac_ubleft, &ubused, &fmterror); + + if (fmterror == BULKSTAT_RV_GIVEUP || + (error && error != -ENOENT && error != -EINVAL)) { + acp->ac_ubleft = 0; + ASSERT(error); + break; + } + + /* be careful not to leak error if at end of chunk */ + if (fmterror == BULKSTAT_RV_NOTHING || error) { + error = 0; + continue; + } + + *ubufp += ubused; + acp->ac_ubleft -= ubused; + acp->ac_ubelem++; + } + + /* + * Post-update *last_agino. At this point, agino will always point one + * inode past the last inode we processed successfully. Hence we + * subtract that inode when setting the *last_agino cursor so that we + * return the correct cookie to userspace. On the next bulkstat call, + * the inode under the lastino cookie will be skipped as we have already + * processed it here. + */ + *last_agino = agino - 1; + + return error; +} + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastinop, /* last inode returned */ + int *ubcountp, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size, /* sizeof struct filling */ + char __user *ubuffer, /* buffer with inode stats */ + int *done) /* 1 if there are no more stats to get */ +{ + xfs_buf_t *agbp; /* agi header buffer */ + xfs_agino_t agino; /* inode # in allocation group */ + xfs_agnumber_t agno; /* allocation group number */ + xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ + size_t irbsize; /* size of irec buffer in bytes */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + int nirbuf; /* size of irbuf */ + int ubcount; /* size of user's buffer */ + struct xfs_bulkstat_agichunk ac; + int error = 0; + + /* + * Get the last inode value, see if there's nothing to do. + */ + agno = XFS_INO_TO_AGNO(mp, *lastinop); + agino = XFS_INO_TO_AGINO(mp, *lastinop); + if (agno >= mp->m_sb.sb_agcount || + *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { + *done = 1; + *ubcountp = 0; + return 0; + } + + ubcount = *ubcountp; /* statstruct's */ + ac.ac_ubuffer = &ubuffer; + ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; + ac.ac_ubelem = 0; + + *ubcountp = 0; + *done = 0; + + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return -ENOMEM; + + nirbuf = irbsize / sizeof(*irbuf); + + /* + * Loop over the allocation groups, starting from the last + * inode returned; 0 means start of the allocation group. + */ + while (agno < mp->m_sb.sb_agcount) { + struct xfs_inobt_rec_incore *irbp = irbuf; + struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; + bool end_of_ag = false; + int icount = 0; + int stat; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + break; + /* + * Allocate and initialize a btree cursor for ialloc btree. 
+ */ + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); + if (agino > 0) { + /* + * In the middle of an allocation group, we need to get + * the remainder of the chunk we're in. + */ + struct xfs_inobt_rec_incore r; + + error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); + if (error) + goto del_cursor; + if (icount) { + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + } + /* Increment to the next record */ + error = xfs_btree_increment(cur, 0, &stat); + } else { + /* Start of ag. Lookup the first inode chunk */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); + } + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + + /* + * Loop through inode btree records in this ag, + * until we run out of inodes or space in the buffer. + */ + while (irbp < irbufend && icount < ubcount) { + struct xfs_inobt_rec_incore r; + + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + + /* + * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. + */ + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + xfs_bulkstat_ichunk_ra(mp, agno, &r); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; + irbp++; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + } + error = xfs_btree_increment(cur, 0, &stat); + if (error || stat == 0) { + end_of_ag = true; + goto del_cursor; + } + cond_resched(); + } + + /* + * Drop the btree buffers and the agi buffer as we can't hold any + * of the locks these represent when calling iget. If there is a + * pending error, then we are done. + */ +del_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + if (error) + break; + /* + * Now format all the good inodes into the user's buffer. 
The + * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer + * for the next loop iteration. + */ + irbufend = irbp; + for (irbp = irbuf; + irbp < irbufend && ac.ac_ubleft >= statstruct_size; + irbp++) { + error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, + formatter, statstruct_size, &ac, + &agino); + if (error) + break; + + cond_resched(); + } + + /* + * If we've run out of space or had a formatting error, we + * are now done + */ + if (ac.ac_ubleft < statstruct_size || error) + break; + + if (end_of_ag) { + agno++; + agino = 0; + } + } + /* + * Done, we're either out of filesystem or space to put the data. + */ + kmem_free(irbuf); + *ubcountp = ac.ac_ubelem; + + /* + * We found some inodes, so clear the error status and return them. + * The lastino pointer will point directly at the inode that triggered + * any error that occurred, so on the next call the error will be + * triggered again and propagated to userspace as there will be no + * formatted inodes in the buffer. + */ + if (ac.ac_ubelem) + error = 0; + + /* + * If we ran out of filesystem, lastino will point off the end of + * the filesystem so the next call will return immediately. + */ + *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); + if (agno >= mp->m_sb.sb_agcount) + *done = 1; + + return error; +} + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const struct xfs_inogrp *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + +/* + * Return inode number table for the filesystem. 
+ */ +int /* error status */ +xfs_inumbers( + struct xfs_mount *mp,/* mount point for filesystem */ + xfs_ino_t *lastino,/* last inode returned */ + int *count,/* size of buffer/count returned */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) +{ + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino); + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agbp = NULL; + struct xfs_inogrp *buffer; + int bcount; + int left = *count; + int bufidx = 0; + int error = 0; + + *count = 0; + if (agno >= mp->m_sb.sb_agcount || + *lastino != XFS_AGINO_TO_INO(mp, agno, agino)) + return error; + + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); + buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); + do { + struct xfs_inobt_rec_incore r; + int stat; + + if (!agbp) { + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + break; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &stat); + if (error) + break; + if (!stat) + goto next_ag; + } + + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error) + break; + if (!stat) + goto next_ag; + + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; + if (++bufidx == bcount) { + long written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (error) + break; + ubuffer += written; + *count += bufidx; + bufidx = 0; + } + if (!--left) + break; + + error = xfs_btree_increment(cur, 0, &stat); + if (error) + break; + if (stat) + continue; + +next_ag: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + cur = NULL; + xfs_buf_relse(agbp); + agbp = NULL; + agino = 0; + agno++; + } while (agno < mp->m_sb.sb_agcount); + + if (!error) { + if (bufidx) { + long 
written; + + error = formatter(ubuffer, buffer, bufidx, &written); + if (!error) + *count += bufidx; + } + *lastino = XFS_AGINO_TO_INO(mp, agno, agino); + } + + kmem_free(buffer); + if (cur) + xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR)); + if (agbp) + xfs_buf_relse(agbp); + + return error; +} diff --git a/kernel/fs/xfs/xfs_itable.h b/kernel/fs/xfs/xfs_itable.h new file mode 100644 index 000000000..6ea8b3912 --- /dev/null +++ b/kernel/fs/xfs/xfs_itable.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_ITABLE_H__ +#define __XFS_ITABLE_H__ + +/* + * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat + * structures (by the dmi library). This is a pointer to a formatter function + * that will iget the inode and fill in the appropriate structure. + * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c + */ +typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +/* + * Values for stat return value. + */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 + +/* + * Return stat information in bulk (by-inode) for the filesystem. 
+ */ +int /* error status */ +xfs_bulkstat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *lastino, /* last inode returned */ + int *count, /* size of buffer/count returned */ + bulkstat_one_pf formatter, /* func that'd fill a single buf */ + size_t statstruct_size,/* sizeof struct that we're filling */ + char __user *ubuffer,/* buffer with inode stats */ + int *done); /* 1 if there are no more stats to get */ + +typedef int (*bulkstat_one_fmt_pf)( /* 0 or negative error */ + void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ + const xfs_bstat_t *buffer); /* buffer to read from */ + +int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + int *ubused, + int *stat); + +int +xfs_bulkstat_one( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + int *ubused, + int *stat); + +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int /* error status */ +xfs_inumbers( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t *last, /* last inode returned */ + int *count, /* size of buffer/count returned */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); + +#endif /* __XFS_ITABLE_H__ */ diff --git a/kernel/fs/xfs/xfs_linux.h b/kernel/fs/xfs/xfs_linux.h new file mode 100644 index 000000000..7c7842c85 --- /dev/null +++ b/kernel/fs/xfs/xfs_linux.h @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. 
+ * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include + +/* + * Kernel specific type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* type */ +typedef unsigned long long xfs_ino_t; /* type */ +typedef __s64 xfs_daddr_t; /* type */ +typedef char * xfs_caddr_t; /* type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif + +#include "xfs_types.h" + +#include "kmem.h" +#include "mrlock.h" +#include "uuid.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "xfs_fs.h" +#include "xfs_stats.h" +#include "xfs_sysctl.h" +#include "xfs_iops.h" +#include "xfs_aops.h" +#include "xfs_super.h" +#include "xfs_cksum.h" +#include "xfs_buf.h" +#include "xfs_message.h" + +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif + +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val +#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val + +#define current_cpu() (raw_smp_processor_id()) +#define current_pid() (current->pid) +#define current_test_flags(f) (current->flags & (f)) +#define current_set_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags |= (f)) +#define current_clear_flags_nested(sp, f) \ + (*(sp) = current->flags, current->flags &= ~(f)) +#define current_restore_flags_nested(sp, f) \ + (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) + +#define spinlock_destroy(lock) + +#define NBBY 8 /* number of bits per byte */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. 
+ */ +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT +#define BLKDEV_IOSIZE (1<> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x * y; +} + +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack 
+#endif + +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#else /* !DEBUG */ + +#ifdef XFS_WARN + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#endif /* XFS_WARN */ +#endif /* DEBUG */ + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + +#endif /* __XFS_LINUX__ */ diff --git a/kernel/fs/xfs/xfs_log.c b/kernel/fs/xfs/xfs_log.c new file mode 100644 index 000000000..bcc7cfabb --- /dev/null +++ b/kernel/fs/xfs/xfs_log.c @@ -0,0 +1,4007 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" +#include "xfs_sysfs.h" +#include "xfs_sb.h" + +kmem_zone_t *xfs_log_ticket_zone; + +/* Local miscellaneous function prototypes */ +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); + +/* local state machine functions */ +STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); + +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int 
need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); + +#if defined(DEBUG) +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); +#else +#define xlog_verify_dest_ptr(a,b) +#define xlog_verify_grant_tail(a) +#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_tail_lsn(a,b,c) +#endif + +STATIC int +xlog_iclogs_empty( + struct xlog *log); + +static void +xlog_grant_sub_space( + struct xlog *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +static void +xlog_grant_add_space( + struct xlog *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int tmp; + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); +} + +STATIC void +xlog_grant_head_init( + struct xlog_grant_head *head) +{ + xlog_assign_grant_head(&head->grant, 1, 0); + INIT_LIST_HEAD(&head->waiters); + spin_lock_init(&head->lock); +} + +STATIC void 
+xlog_grant_head_wake_all( + struct xlog_grant_head *head) +{ + struct xlog_ticket *tic; + + spin_lock(&head->lock); + list_for_each_entry(tic, &head->waiters, t_queue) + wake_up_process(tic->t_task); + spin_unlock(&head->lock); +} + +static inline int +xlog_ticket_reservation( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic) +{ + if (head == &log->l_write_head) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + return tic->t_unit_res; + } else { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + else + return tic->t_unit_res; + } +} + +STATIC bool +xlog_grant_head_wake( + struct xlog *log, + struct xlog_grant_head *head, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &head->waiters, t_queue) { + need_bytes = xlog_ticket_reservation(log, head, tic); + if (*free_bytes < need_bytes) + return false; + + *free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up_process(tic->t_task); + } + + return true; +} + +STATIC int +xlog_grant_head_wait( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) +{ + list_add_tail(&tic->t_queue, &head->waiters); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&head->lock); + + XFS_STATS_INC(xs_sleep_logspace); + + trace_xfs_log_grant_sleep(log, tic); + schedule(); + trace_xfs_log_grant_wake(log, tic); + + spin_lock(&head->lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &head->grant) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return -EIO; +} + +/* + * Atomically get the log space required for a log ticket. 
+ * + * Once a ticket gets put onto head->waiters, it will only return after the + * needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off head->waiters under head->lock, we + * only need to take that lock if we are going to add the ticket to the queue + * and sleep. We can avoid taking the lock if the ticket was never added to + * head->waiters because the t_queue list head will be empty and we hold the + * only reference to it so it can safely be checked unlocked. + */ +STATIC int +xlog_grant_head_check( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int *need_bytes) +{ + int free_bytes; + int error = 0; + + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. 
+ */ + *need_bytes = xlog_ticket_reservation(log, head, tic); + free_bytes = xlog_space_left(log, &head->grant); + if (!list_empty_careful(&head->waiters)) { + spin_lock(&head->lock); + if (!xlog_grant_head_wake(log, head, &free_bytes) || + free_bytes < *need_bytes) { + error = xlog_grant_head_wait(log, head, tic, + *need_bytes); + } + spin_unlock(&head->lock); + } else if (free_bytes < *need_bytes) { + spin_lock(&head->lock); + error = xlog_grant_head_wait(log, head, tic, *need_bytes); + spin_unlock(&head->lock); + } + + return error; +} + +static void +xlog_tic_reset_res(xlog_ticket_t *tic) +{ + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; +} + +static void +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) +{ + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; +} + +/* + * Replenish the byte reservation required by moving the grant write head. + */ +int +xfs_log_regrant( + struct xfs_mount *mp, + struct xlog_ticket *tic) +{ + struct xlog *log = mp->m_log; + int need_bytes; + int error = 0; + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + XFS_STATS_INC(xs_try_logspace); + + /* + * This is a new transaction on the ticket, so we need to change the + * transaction ID so that the next transaction has a different TID in + * the log. Just add one to the existing tid so that we can see chains + * of rolling transactions in the log easily. 
+ */ + tic->t_tid++; + + xlog_grant_push_ail(log, tic->t_unit_res); + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + + trace_xfs_log_regrant(log, tic); + + error = xlog_grant_head_check(log, &log->l_write_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_regrant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + +/* + * Reserve log space and return a ticket corresponding the reservation. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticp, + __uint8_t client, + bool permanent, + uint t_type) +{ + struct xlog *log = mp->m_log; + struct xlog_ticket *tic; + int need_bytes; + int error = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + XFS_STATS_INC(xs_try_logspace); + + ASSERT(*ticp == NULL); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, + KM_SLEEP | KM_MAYFAIL); + if (!tic) + return -ENOMEM; + + tic->t_trans_type = t_type; + *ticp = tic; + + xlog_grant_push_ail(log, tic->t_cnt ? 
tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); + + trace_xfs_log_reserve(log, tic); + + error = xlog_grant_head_check(log, &log->l_reserve_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_reserve_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + + +/* + * NOTES: + * + * 1. currblock field gets updated at startup and after in-core logs + * marked as with WANT_SYNC. + */ + +/* + * This routine is called when a user of a log manager ticket is done with + * the reservation. If the ticket was ever used, then a commit record for + * the associated transaction is written out as a log operation header with + * no data. The flag XLOG_TIC_INITED is set when the first write occurs with + * a given ticket. If the ticket was one with a permanent reservation, then + * a few operations are done differently. Permanent reservation tickets by + * default don't release the reservation. They just commit the current + * transaction with the belief that the reservation is still needed. A flag + * must be passed in before permanent reservations are actually released. + * When these type of tickets are not released, they need to be set into + * the inited state again. By doing this, a start record will be written + * out when the next write occurs. 
+ */ +xfs_lsn_t +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) +{ + struct xlog *log = mp->m_log; + xfs_lsn_t lsn = 0; + + if (XLOG_FORCED_SHUTDOWN(log) || + /* + * If nothing was ever written, don't write out commit record. + * If we get an error, just continue and give back the log ticket. + */ + (((ticket->t_flags & XLOG_TIC_INITED) == 0) && + (xlog_commit_record(log, ticket, iclog, &lsn)))) { + lsn = (xfs_lsn_t) -1; + if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { + flags |= XFS_LOG_REL_PERM_RESERV; + } + } + + + if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || + (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + + /* + * Release ticket if not permanent reservation or a specific + * request has been made to release a permanent reservation. + */ + xlog_ungrant_log_space(log, ticket); + xfs_log_ticket_put(ticket); + } else { + trace_xfs_log_done_perm(log, ticket); + + xlog_regrant_reserve_log_space(log, ticket); + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ + ticket->t_flags |= XLOG_TIC_INITED; + } + + return lsn; +} + +/* + * Attaches a new iclog I/O completion callback routine during + * transaction commit. If the log is in error state, a non-zero + * return code is handed back and the caller is responsible for + * executing the callback at an appropriate time. 
+ */ +int +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) +{ + int abortflg; + + spin_lock(&iclog->ic_callback_lock); + abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); + if (!abortflg) { + ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || + (iclog->ic_state == XLOG_STATE_WANT_SYNC)); + cb->cb_next = NULL; + *(iclog->ic_callback_tail) = cb; /* append cb at the tail of the callback chain */ + iclog->ic_callback_tail = &(cb->cb_next); + } + spin_unlock(&iclog->ic_callback_lock); + return abortflg; /* non-zero: iclog is in IOERROR state and cb was not attached */ +} + +int +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) +{ + if (xlog_state_release_iclog(mp->m_log, iclog)) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); /* any release failure shuts the filesystem down */ + return -EIO; + } + + return 0; +} + +/* + * Mount a log filesystem + * + * mp - ubiquitous xfs mount point structure + * log_target - buftarg of on-disk log device + * blk_offset - Start block # where block size is 512 bytes (BBSIZE) + * num_bblks - Number of BBSIZE blocks in on-disk log + * + * Return error or zero. + */ +int +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + int error = 0; + int min_logfsbs; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { + xfs_notice(mp, +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(mp->m_log)) { + error = PTR_ERR(mp->m_log); + goto out; + } + + /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. 
This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = -EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = -EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = -EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* + * Initialize the AIL now we have a log. The -EINVAL left over from a + * tolerated size-check failure is overwritten by this call's result. + */ + error = xfs_trans_ail_init(mp); + if (error) { + xfs_warn(mp, "AIL initialisation failed: error %d", error); + goto out_free_log; + } + mp->m_log->l_ailp = mp->m_ail; + + /* + * skip log recovery on a norecovery mount. pretend it all + * just worked. 
+ */ + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); + + if (readonly) + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* cleared only across xlog_recover(); restored right after */ + + error = xlog_recover(mp->m_log); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (error) { + xfs_warn(mp, "log mount/recovery failed: error %d", + error); + goto out_destroy_ail; + } + } + + error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + "log"); + if (error) + goto out_destroy_ail; + + /* Normal transactions can now occur */ + mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + + /* + * Now the log has been fully initialised and we know where our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + + return 0; + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); +out: + return error; +} + +/* + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. + * + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. + */ +int +xfs_log_mount_finish(xfs_mount_t *mp) +{ + int error = 0; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + } else { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + } + + + return error; +} + +/* + * Final log writes as part of unmount. + * + * Mark the filesystem clean as unmount happens. Note that during relocation + * this routine needs to be executed as part of source-bag while the + * deallocation must not be done until source-end. 
+ */ + +/* + * Unmount record used to have a string "Unmount filesystem--" in the + * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). + * We just write the magic number now since that particular field isn't + * currently architecture converted and "Unmount" is a bit foo. + * As far as I know, there weren't any dependencies on the old behaviour. + */ + +int +xfs_log_unmount_write(xfs_mount_t *mp) +{ + struct xlog *log = mp->m_log; + xlog_in_core_t *iclog; +#ifdef DEBUG + xlog_in_core_t *first_iclog; +#endif + xlog_ticket_t *tic = NULL; + xfs_lsn_t lsn; + int error; + + /* + * Don't write out unmount record on read-only mounts. + * Or, if we are doing a forced umount (typically because of IO errors). + */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + +#ifdef DEBUG + first_iclog = iclog = log->l_iclog; + do { + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } + iclog = iclog->ic_next; + } while (iclog != first_iclog); +#endif + if (! (XLOG_FORCED_SHUTDOWN(log))) { + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + if (!error) { + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = &magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + /* remove inited flag, and account for space used */ + tic->t_flags = 0; + tic->t_curr_res -= sizeof(magic); + error = xlog_write(log, &vec, tic, &lsn, + NULL, XLOG_UNMOUNT_TRANS); + /* + * At this point, we're umounting anyway, + * so there's no point in transitioning log state + * to IOERROR. Just continue... 
+ */ + } + + if (error) + xfs_alert(mp, "%s: unmount record failed", __func__); + + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + if (!(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY)) { + if (!XLOG_FORCED_SHUTDOWN(log)) { + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } else { + spin_unlock(&log->l_icloglock); + } + if (tic) { + trace_xfs_log_umount_write(log, tic); + xlog_ungrant_log_space(log, tic); + xfs_log_ticket_put(tic); + } + } else { + /* + * We're already in forced_shutdown mode, couldn't + * even attempt to write out the unmount transaction. + * + * Go through the motions of sync'ing and releasing + * the iclog, even though no I/O will actually happen, + * we need to wait for other log I/Os that may already + * be in progress. Do this as a separate section of + * code so we'll know if we ever get stuck here that + * we're in this odd situation of trying to unmount + * a file system that went into forced_shutdown as + * the result of an unmount.. + */ + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + atomic_inc(&iclog->ic_refcnt); + + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + + spin_lock(&log->l_icloglock); + + if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE + || iclog->ic_state == XLOG_STATE_DIRTY + || iclog->ic_state == XLOG_STATE_IOERROR) ) { + + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + } + + return error; +} /* xfs_log_unmount_write */ + +/* + * Empty the log for unmount/freeze. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. 
We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. + */ +void +xfs_log_quiesce( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); +} + +/* + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. + */ +void +xfs_log_unmount( + struct xfs_mount *mp) +{ + xfs_log_quiesce(mp); + + xfs_trans_ail_destroy(mp); + + xfs_sysfs_del(&mp->m_log->l_kobj); + + xlog_dealloc_log(mp->m_log); +} + +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + const struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} + +/* + * Wake up processes waiting for log space after we have moved the log tail. 
+ */ +void +xfs_log_space_wake( + struct xfs_mount *mp) +{ + struct xlog *log = mp->m_log; + int free_bytes; + + if (XLOG_FORCED_SHUTDOWN(log)) + return; /* no wakeups are attempted once the log is shut down */ + /* Unlocked emptiness check first; the head lock is only taken when there are waiters. */ + if (!list_empty_careful(&log->l_write_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_write_head.lock); + free_bytes = xlog_space_left(log, &log->l_write_head.grant); + xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); + spin_unlock(&log->l_write_head.lock); + } + /* Same procedure for tickets queued on the reserve head. */ + if (!list_empty_careful(&log->l_reserve_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + spin_lock(&log->l_reserve_head.lock); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); + spin_unlock(&log->l_reserve_head.lock); + } +} + +/* + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs need to be empty before + * we start attempting to cover the log. + * + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIL or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. 
+ */ +int +xfs_log_need_covered(xfs_mount_t *mp) +{ + struct xlog *log = mp->m_log; + int needed = 0; + + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) + return 0; + + if (!xlog_cil_empty(log)) + return 0; /* CIL still holds items: the log is not idle */ + + spin_lock(&log->l_icloglock); + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (xfs_ail_min_lsn(log->l_ailp)) /* presumably non-zero while the AIL holds items - TODO confirm */ + break; + if (!xlog_iclogs_empty(log)) + break; + /* Log is idle: request a dummy transaction and advance NEED -> DONE (NEED2 -> DONE2). */ + needed = 1; + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; + break; + default: + needed = 1; + break; + } + spin_unlock(&log->l_icloglock); + return needed; +} + +/* + * We may be holding the log iclog lock upon entering this routine. The caller must hold mp->m_ail->xa_lock (asserted below). + */ +xfs_lsn_t +xlog_assign_tail_lsn_locked( + struct xfs_mount *mp) +{ + struct xlog *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; + + assert_spin_locked(&mp->m_ail->xa_lock); + + /* + * To make sure we always have a valid LSN for the log tail we keep + * track of the last LSN which was committed in log->l_last_sync_lsn, + * and use that when the AIL is empty. + */ + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); + return tail_lsn; +} +/* Locking wrapper: takes the AIL xa_lock around xlog_assign_tail_lsn_locked(). */ +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} + +/* + * Return the space in the log between the tail and the head. The head + * is passed in the cycle/bytes formal parms. In the special case where + * the reserve head has wrapped past the tail, this calculation is no + * longer valid. 
In this case, just return 0 which means there is no space + * in the log. This works for all places where this function is called + * with the reserve head. Of course, if the write head were to ever + * wrap the tail, we should blow up. Rather than catch this case here, + * we depend on other ASSERTions in other parts of the code. XXXmiken + * + * This code also handles the case where the reservation head is behind + * the tail. The details of this case are described below, but the end + * result is that we return the size of the log as the amount of space left. + */ +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head) +{ + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) + return 0; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; + } else { + /* + * The reservation head is behind the tail. + * In this case we just want to return the size of the + * log as the amount of space left. + */ + xfs_alert(log->l_mp, + "xlog_space_left: head behind tail\n" + " tail_cycle = %d, tail_bytes = %d\n" + " GH cycle = %d, GH bytes = %d", + tail_cycle, tail_bytes, head_cycle, head_bytes); + ASSERT(0); + free_bytes = log->l_logsize; + } + return free_bytes; +} + + +/* + * Log function which is called when an io completes. + * + * The log manager needs its own routine, in order to control what + * happens with the buffer after the write completes. 
+ */ +void +xlog_iodone(xfs_buf_t *bp) +{ + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; + + /* + * Race to shutdown the filesystem if we see an error. + */ + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, + XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); + xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); + /* + * This flag will be propagated to the trans-committed + * callback routines to let them know that the log-commit + * didn't succeed. + */ + aborted = XFS_LI_ABORTED; + } else if (iclog->ic_state & XLOG_STATE_IOERROR) { + aborted = XFS_LI_ABORTED; + } + + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); + xlog_state_done_syncing(iclog, aborted); + + /* + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. + */ + xfs_buf_unlock(bp); +} + +/* + * Return size of each in-core log record buffer. + * + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. + * + * If the filesystem blocksize is too large, we may need to choose a + * larger size since the directory code currently logs entire blocks. + */ + +STATIC void +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) +{ + int size; + int xhdrs; + + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else + log->l_iclog_bufs = mp->m_logbufs; + + /* + * Buffer size passed in from mount system call. 
+ */ + if (mp->m_logbsize > 0) { + size = log->l_iclog_size = mp->m_logbsize; + log->l_iclog_size_log = 0; + while (size != 1) { + log->l_iclog_size_log++; + size >>= 1; + } + + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + /* # headers = size / 32k + * one header holds cycles from 32k of data + */ + + xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; + if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + log->l_iclog_hsize = xhdrs << BBSHIFT; + log->l_iclog_heads = xhdrs; + } else { + ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + } + goto done; + } + + /* All machines use 32kB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; + + /* the default log size is 16k or 32k which is one header sector */ + log->l_iclog_hsize = BBSIZE; + log->l_iclog_heads = 1; + +done: + /* are we being asked to make the sizes selected above visible? */ + if (mp->m_logbufs == 0) + mp->m_logbufs = log->l_iclog_bufs; + if (mp->m_logbsize == 0) + mp->m_logbsize = log->l_iclog_size; +} /* xlog_get_iclog_buffer_size */ + + +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) { + /* + * Dump a transaction into the log that contains no real change. + * This is needed to stamp the current tail LSN into the log + * during the covering operation. 
+ * + * We cannot use an inode here for this - that will push dirty + * state back up into the VFS and then periodic inode flushing + * will prevent log covering from making progress. Hence we + * synchronously log the superblock instead to ensure the + * superblock is immediately unpinned and can be written back. + */ + xfs_sync_sb(mp, true); + } else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + +/* + * This routine initializes some of the log structure for a given mount point. + * Its primary purpose is to fill in enough, so recovery can occur. However, + * some other stuff may be filled in too. + */ +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + struct xlog *log; + xlog_rec_header_t *head; + xlog_in_core_t **iclogp; + xlog_in_core_t *iclog, *prev_iclog=NULL; + xfs_buf_t *bp; + int i; + int error = -ENOMEM; + uint log2_size = 0; + + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + if (!log) { + xfs_warn(mp, "Log allocation failed: No memory!"); + goto out; + } + + log->l_mp = mp; + log->l_targ = log_target; + log->l_logsize = BBTOB(num_bblks); + log->l_logBBstart = blk_offset; + log->l_logBBsize = num_bblks; + log->l_covered_state = XLOG_STATE_COVER_IDLE; + log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + + log->l_prev_block = -1; + /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); + log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + + xlog_grant_head_init(&log->l_reserve_head); + xlog_grant_head_init(&log->l_write_head); + + error = -EFSCORRUPTED; + if (xfs_sb_version_hassector(&mp->m_sb)) { + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < 
BBSHIFT) { + xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", + log2_size, BBSHIFT); + goto out_free_log; + } + + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", + log2_size, mp->m_sectbb_log); + goto out_free_log; + } + + /* for larger sector sizes, must have v2 or external log */ + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + xfs_warn(mp, + "log sector size (0x%x) invalid for configuration.", + log2_size); + goto out_free_log; + } + } + log->l_sectBBsize = 1 << log2_size; + + xlog_get_iclog_buffer_size(mp, log); + + /* + * Use a NULL block for the extra log buffer used during splits so that + * it will trigger errors if we ever try to do IO on it without first + * having set it up properly. + */ + error = -ENOMEM; + bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, + BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_log; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; + bp->b_iodone = xlog_iodone; + log->l_xbuf = bp; + + spin_lock_init(&log->l_icloglock); + init_waitqueue_head(&log->l_flush_wait); + + iclogp = &log->l_iclog; + /* + * The amount of memory to allocate for the iclog structure is + * rather funky due to the way the structure is defined. It is + * done this way so that we can use different sizes for machines + * with different amounts of memory. See the definition of + * xlog_in_core_t in xfs_log_priv.h for details. 
+ */ + ASSERT(log->l_iclog_size >= 4096); + for (i=0; i < log->l_iclog_bufs; i++) { + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + + iclog = *iclogp; + iclog->ic_prev = prev_iclog; + prev_iclog = iclog; + + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_iclog; + + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; + bp->b_iodone = xlog_iodone; + iclog->ic_bp = bp; + iclog->ic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif + head = &iclog->ic_header; + memset(head, 0, sizeof(xlog_rec_header_t)); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); + /* new fields */ + head->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; + + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); + + iclogp = &iclog->ic_next; + } + *iclogp = log->l_iclog; /* complete ring */ + log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; + return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) + xfs_buf_free(iclog->ic_bp); + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); 
+out: + return ERR_PTR(error); +} /* xlog_alloc_log */ + + +/* + * Write out the commit record of a transaction associated with the given + * ticket. Return the lsn of the commit record. + */ +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) +{ + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + ASSERT_ALWAYS(iclog); + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* + * Push on the buffer cache code if we ever use more than 75% of the on-disk + * log space. This code pushes on the lsn which would supposedly free up + * the 25% which we want to leave free. We may need to adopt a policy which + * pushes on an lsn which is further along in the log once we reach the high + * water mark. In this manner, we would be creating a low water mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. 
+ */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; + if (threshold_block >= log->l_logBBsize) { + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; + } + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. 
+ */ + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_ail_push(log->l_ailp, threshold_lsn); +} + +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* + * The bdstrat callback function for log bufs. 
This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog = bp->b_fspriv; + + xfs_buf_lock(bp); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. + */ + return 0; + } + + xfs_buf_submit(bp); + return 0; +} + +/* + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * fashion. Previously, we should have moved the current iclog + * ptr in the log to point to the next available iclog. This allows further + * write to continue while this code syncs out an iclog ready to go. + * Before an in-core log can be written out, the data section must be scanned + * to save away the 1st word of each BBSIZE block into the header. We replace + * it with the current cycle count. Each BBSIZE block is tagged with the + * cycle count because there in an implicit assumption that drives will + * guarantee that entire 512 byte blocks get written at once. 
In other words, + * we can't have part of a 512 byte block written and part not written. By + * tagging each block, we will know which blocks are valid when recovering + * after an unclean shutdown. + * + * This routine is single threaded on the iclog. No other thread can be in + * this routine with the same iclog. Changing contents of iclog can there- + * fore be done without grabbing the state machine lock. Updating the global + * log will require grabbing the lock though. + * + * The entire log manager uses a logical block numbering scheme. Only + * log_sync (and then only bwrite()) know about the fact that the log may + * not start with block zero on a given device. The log block start offset + * is added immediately before calling bwrite(). + */ + +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) +{ + xfs_buf_t *bp; + int i; + uint count; /* byte count of bwrite */ + uint count_init; /* initial count before roundup */ + int roundoff; /* roundoff to BB or stripe */ + int split = 0; /* split write into two regions */ + int error; + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; + + XFS_STATS_INC(xs_log_writes); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + roundoff = count - count_init; + ASSERT(roundoff >= 0); + ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && + roundoff < log->l_mp->m_sb.sb_logsunit) + || + (log->l_mp->m_sb.sb_logsunit <= 1 && + roundoff < BBTOB(1))); + + /* move grant heads by roundoff in sync */ + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + + /* put cycle number in every block */ + 
xlog_pack_data(log, iclog, roundoff); + + /* real byte length */ + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); + + bp = iclog->ic_bp; + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); + + XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + + /* Do we need to split this write into 2 parts? */ + if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); + count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } + } else { + iclog->ic_bwritecnt = 1; + } + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + + bp->b_io_length = BTOBB(count); + bp->b_fspriv = iclog; + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_SYNCIO; + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + bp->b_flags |= XBF_FUA; + + /* + * Flush the data device before flushing the log to make + * sure all meta data written back from the AIL actually made + * it to disk before stamping the new log tail LSN into the + * log buffer. For an external log we need to issue the + * flush explicitly, and unfortunately synchronously here; + * for an internal log we can simply use the block layer + * state machine for preflushes. 
+ */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; + } + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + xlog_verify_iclog(log, iclog, count, true); + + /* account for log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + /* + * Don't call xfs_bwrite here. We do log-syncs even when the filesystem + * is shutting down. + */ + XFS_BUF_WRITE(bp); + + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); + return error; + } + if (split) { + bp = iclog->ic_log->l_xbuf; + XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); + bp->b_fspriv = iclog; + XFS_BUF_ZEROFLAGS(bp); + XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_SYNCIO; + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + bp->b_flags |= XBF_FUA; + + ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); + ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); + + /* account for internal log which doesn't start at block #0 */ + XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); + XFS_BUF_WRITE(bp); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); + return error; + } + } + return 0; +} /* xlog_sync */ + +/* + * Deallocate a log structure + */ +STATIC void +xlog_dealloc_log( + struct xlog *log) +{ + xlog_in_core_t *iclog, *next_iclog; + int i; + + xlog_cil_destroy(log); + + /* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. 
+ */ + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); + xfs_buf_free(log->l_xbuf); + + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_free(iclog->ic_bp); + next_iclog = iclog->ic_next; + kmem_free(iclog); + iclog = next_iclog; + } + spinlock_destroy(&log->l_icloglock); + + log->l_mp->m_log = NULL; + kmem_free(log); +} /* xlog_dealloc_log */ + +/* + * Update counters atomically now that memcpy is done. + */ +/* ARGSUSED */ +static inline void +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) +{ + spin_lock(&log->l_icloglock); + + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + iclog->ic_offset += copy_bytes; + + spin_unlock(&log->l_icloglock); +} /* xlog_state_finish_copy */ + + + + +/* + * print out info relating to regions written which consume + * the reservation + */ +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) +{ + uint i; + uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); + + /* match with XLOG_REG_TYPE_* in xfs_log.h */ + static char *res_type_str[XLOG_REG_TYPE_MAX] = { + "bformat", + "bchunk", + "efi_format", + "efd_format", + "iformat", + "icore", + "iext", + "ibroot", + "ilocal", + "iattr_ext", + "iattr_broot", + "iattr_local", + "qformat", + "dquot", + "quotaoff", + "LR header", + "unmount", + "commit", + "trans header" + }; + static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { + "SETATTR_NOT_SIZE", + "SETATTR_SIZE", + "INACTIVE", + "CREATE", + "CREATE_TRUNC", + 
"TRUNCATE_FILE", + "REMOVE", + "LINK", + "RENAME", + "MKDIR", + "RMDIR", + "SYMLINK", + "SET_DMATTRS", + "GROWFS", + "STRAT_WRITE", + "DIOSTRAT", + "WRITE_SYNC", + "WRITEID", + "ADDAFORK", + "ATTRINVAL", + "ATRUNCATE", + "ATTR_SET", + "ATTR_RM", + "ATTR_FLAG", + "CLEAR_AGI_BUCKET", + "QM_SBCHANGE", + "DUMMY1", + "DUMMY2", + "QM_QUOTAOFF", + "QM_DQALLOC", + "QM_SETQLIM", + "QM_DQCLUSTER", + "QM_QINOCREATE", + "QM_QUOTAOFF_END", + "SB_UNIT", + "FSYNC_TS", + "GROWFSRT_ALLOC", + "GROWFSRT_ZERO", + "GROWFSRT_FREE", + "SWAPEXT" + }; + + xfs_warn(mp, + "xlog_write: reservation summary:\n" + " trans type = %s (%u)\n" + " unit res = %d bytes\n" + " current res = %d bytes\n" + " total reg = %u bytes (o/flow = %u bytes)\n" + " ophdrs = %u (ophdr space = %u bytes)\n" + " ophdr + reg = %u bytes\n" + " num regions = %u", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + ticket->t_trans_type, + ticket->t_unit_res, + ticket->t_curr_res, + ticket->t_res_arr_sum, ticket->t_res_o_flow, + ticket->t_res_num_ophdrs, ophdr_spc, + ticket->t_res_arr_sum + + ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_num); + + for (i = 0; i < ticket->t_res_num; i++) { + uint r_type = ticket->t_res_arr[i].r_type; + xfs_warn(mp, "region[%u]: %s - %u bytes", i, + ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? + "bad-rtype" : res_type_str[r_type-1]), + ticket->t_res_arr[i].r_len); + } + + xfs_alert_tag(mp, XFS_PTAG_LOGRES, + "xlog_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. 
+ */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct xlog *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. 
+ */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_warn(log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct xlog *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + 
* xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; +} + +/* + * Write some region out to in-core log + * + * This will be called when writing externally provided regions or when + * writing out a commit record for a given transaction. + * + * General algorithm: + * 1. Find total length of this write. This may include adding to the + * lengths passed in. + * 2. Check whether we violate the tickets reservation. + * 3. While writing to this iclog + * A. Reserve as much space in this iclog as can get + * B. If this is first write, save away start lsn + * C. While writing this region: + * 1. If first write of transaction, write start record + * 2. Write log operation header (header per region) + * 3. Find out if we can fit entire region into this iclog + * 4. Potentially, verify destination memcpy ptr + * 5. Memcpy (partial) region + * 6. If partial copy, release iclog; otherwise, continue + * copying more regions into current iclog + * 4. Mark want sync bit (in simulation mode) + * 5. Release iclog for potential flush to on-disk log. + * + * ERRORS: + * 1. Panic if reservation is overrun. This should never happen since + * reservation amounts are generated internal to the filesystem. + * NOTES: + * 1. Tickets are single threaded data structures. + * 2. 
The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the + * syncing routine. When a single log_write region needs to span + * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set + * on all log operation writes which don't contain the end of the + * region. The XLOG_END_TRANS bit is used for the in-core log + * operation which contains the end of the continued log_write region. + * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog, + * we don't really know exactly how much space will be used. As a result, + * we don't update ic_offset until the end when we know exactly how many + * bytes have been written out. + */ +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); + + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. 
+ */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); + + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); + + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + void *ptr; + int log_offset; + + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; + + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; + + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } + + reg = &vecp[index]; + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); + + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } + + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return -EIO; + + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + 
copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; + + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; + + if (++index == lv->lv_niovecs) { +next_lv: + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0 && ordered == false) { + if (!lv) + return 0; + break; + } + } + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + return 0; +} + + +/***************************************************************************** + * + * State Machine functions + * + ***************************************************************************** + */ + +/* Clean iclogs starting from the head. This ordering must be + * maintained, so an iclog doesn't become ACTIVE beyond one that + * is SYNCING. This is also required to maintain the notion that we use + * a ordered wait queue to hold off would be writers to the log when every + * iclog is trying to sync to disk. 
+ * + * State Change: DIRTY -> ACTIVE + */ +STATIC void +xlog_state_clean_log( + struct xlog *log) +{ + xlog_in_core_t *iclog; + int changed = 0; /* 0: nothing cleaned, 1: first cleaned iclog held exactly XLOG_COVER_OPS ops, 2: anything else */ + + iclog = log->l_iclog; + do { + if (iclog->ic_state == XLOG_STATE_DIRTY) { + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + ASSERT(iclog->ic_callback == NULL); + /* + * If the number of ops in this iclog indicate it just + * contains the dummy transaction, we can + * change state into IDLE (the second time around). + * Otherwise we should change the state into + * NEED a dummy. + * We don't need to cover the dummy. + */ + if (!changed && + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { + changed = 1; + } else { + /* + * We have two dirty iclogs so start over + * This could also be num of ops indicates + * this is not the dummy going out. + */ + changed = 2; + } + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) + /* do nothing */; + else + break; /* stop cleaning */ + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + + /* log is locked when we are called */ + /* + * Change state for the dummy log recording. + * We usually go to NEED. But we go to NEED2 if the changed indicates + * we are done writing the dummy record. + * If we are done with the second dummy record (DONE2), then + * we go to IDLE. 
+ */ + if (changed) { + switch (log->l_covered_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_NEED2; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + case XLOG_STATE_COVER_DONE2: + if (changed == 1) + log->l_covered_state = XLOG_STATE_COVER_IDLE; + else + log->l_covered_state = XLOG_STATE_COVER_NEED; + break; + + default: + ASSERT(0); + } + } +} /* xlog_state_clean_log */ + /* + * Walk the whole iclog ring once and return the lowest header lsn found + * among iclogs that are neither ACTIVE nor DIRTY, or 0 when there are none. + */ +STATIC xfs_lsn_t +xlog_get_lowest_lsn( + struct xlog *log) +{ + xlog_in_core_t *lsn_log; + xfs_lsn_t lowest_lsn, lsn; + + lsn_log = log->l_iclog; + lowest_lsn = 0; + do { + if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || + (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + lowest_lsn = lsn; + } + } + lsn_log = lsn_log->ic_next; + } while (lsn_log != log->l_iclog); + return lowest_lsn; +} + + /* + * Run the callbacks queued on iclogs that are in the DONE_SYNC, DO_CALLBACK + * or IOERROR state, walking the ring in order from log->l_iclog and passing + * 'aborted' to every callback. Each processed iclog is then marked DIRTY + * (unless it is in IOERROR), the log is cleaned and its ic_force_wait + * waiters are woken. The scan repeats until a full pass runs no callbacks + * or an IOERROR iclog was seen. If an earlier iclog is not ready yet and + * ciclog is in DONE_SYNC, ciclog is moved to DO_CALLBACK instead. 
+ */ +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) +{ + xlog_in_core_t *iclog; + xlog_in_core_t *first_iclog; /* used to know when we've + * processed all iclogs once */ + xfs_log_callback_t *cb, *cb_next; + int flushcnt = 0; + xfs_lsn_t lowest_lsn; + int ioerrors; /* counter: iclogs with errors */ + int loopdidcallbacks; /* flag: inner loop did callbacks*/ + int funcdidcallbacks; /* flag: function did callbacks */ + int repeats; /* for issuing console warnings if + * looping too many times */ + int wake = 0; + + spin_lock(&log->l_icloglock); + first_iclog = iclog = log->l_iclog; + ioerrors = 0; + funcdidcallbacks = 0; + repeats = 0; + + do { + /* + * Scan all iclogs starting with the one pointed to by the + * log. Reset this starting point each time the log is + * unlocked (during callbacks). 
+ * + * Keep looping through iclogs until one full pass is made + * without running any callbacks. + */ + first_iclog = log->l_iclog; + iclog = log->l_iclog; + loopdidcallbacks = 0; + repeats++; + + do { + + /* skip all iclogs in the ACTIVE & DIRTY states */ + if (iclog->ic_state & + (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) { + iclog = iclog->ic_next; + continue; + } + + /* + * Between marking a filesystem SHUTDOWN and stopping + * the log, we do flush all iclogs to disk (if there + * wasn't a log I/O error). So, we do want things to + * go smoothly in case of just a SHUTDOWN w/o a + * LOG_IO_ERROR. + */ + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Can only perform callbacks in order. Since + * this iclog is not in the DONE_SYNC/ + * DO_CALLBACK state, we skip the rest and + * just try to clean up. If we set our iclog + * to DO_CALLBACK, we will not process it when + * we retry since a previous iclog is in the + * CALLBACK and the state cannot change since + * we are holding the l_icloglock. + */ + if (!(iclog->ic_state & + (XLOG_STATE_DONE_SYNC | + XLOG_STATE_DO_CALLBACK))) { + if (ciclog && (ciclog->ic_state == + XLOG_STATE_DONE_SYNC)) { + ciclog->ic_state = XLOG_STATE_DO_CALLBACK; + } + break; + } + /* + * We now have an iclog that is in either the + * DO_CALLBACK or DONE_SYNC states. The other + * states (WANT_SYNC, SYNCING, or CALLBACK were + * caught by the above if and are going to + * clean (i.e. we aren't doing their callbacks) + * see the above if. + */ + + /* + * We will do one more check here to see if we + * have chased our tail around. 
+ */ + + lowest_lsn = xlog_get_lowest_lsn(log); + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { + iclog = iclog->ic_next; + continue; /* Leave this iclog for + * another thread */ + } + + iclog->ic_state = XLOG_STATE_CALLBACK; + + + /* + * Completion of an iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving the tail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the + * icloglock to ensure we are the only one that + * can update it. + */ + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); + + } else + ioerrors++; + + spin_unlock(&log->l_icloglock); + + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. 
+ */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { + iclog->ic_callback_tail = &(iclog->ic_callback); + iclog->ic_callback = NULL; + spin_unlock(&iclog->ic_callback_lock); + + /* perform callbacks in the order given */ + for (; cb; cb = cb_next) { + cb_next = cb->cb_next; + cb->cb_func(cb->cb_arg, aborted); + } + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + } + + loopdidcallbacks++; + funcdidcallbacks++; + + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); + if (!(iclog->ic_state & XLOG_STATE_IOERROR)) + iclog->ic_state = XLOG_STATE_DIRTY; + + /* + * Transition from DIRTY to ACTIVE if applicable. + * NOP if STATE_IOERROR. + */ + xlog_state_clean_log(log); + + /* wake up threads waiting in xfs_log_force() */ + wake_up_all(&iclog->ic_force_wait); + + iclog = iclog->ic_next; + } while (first_iclog != iclog); + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; + xfs_warn(log->l_mp, + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); + } + } while (!ioerrors && loopdidcallbacks); + + /* + * make one last gasp attempt to see if iclogs are being left in + * limbo.. + */ +#ifdef DEBUG + if (funcdidcallbacks) { + first_iclog = iclog = log->l_iclog; + do { + ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK); + /* + * Terminate the loop if iclogs are found in states + * which will cause other threads to clean up iclogs. 
+ * + * SYNCING - i/o completion will go through logs + * DONE_SYNC - interrupt thread should be waiting for + * l_icloglock + * IOERROR - give up hope all ye who enter here + */ + if (iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_DONE_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR ) + break; + iclog = iclog->ic_next; + } while (first_iclog != iclog); + } +#endif + + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + wake_up_all(&log->l_flush_wait); +} + + +/* + * Finish transitioning this iclog to the dirty state. + * + * Make sure that we completely execute this routine only when this is + * the last call to the iclog. There is a good chance that iclog flushes, + * when we reach the end of the physical log, get turned into 2 separate + * calls to bwrite. Hence, one iclog flush could generate two calls to this + * routine. By using the reference count bwritecnt, we guarantee that only + * the second completion goes through. + * + * Callbacks could take time, so they are done outside the scope of the + * global state machine log lock. + */ +STATIC void +xlog_state_done_syncing( + xlog_in_core_t *iclog, + int aborted) +{ + struct xlog *log = iclog->ic_log; + + spin_lock(&log->l_icloglock); + + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state == XLOG_STATE_IOERROR); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); + + + /* + * If we got an error, either on the first buffer, or in the case of + * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, + * and none should ever be attempted to be written to disk + * again. 
+ */ + if (iclog->ic_state != XLOG_STATE_IOERROR) { + if (--iclog->ic_bwritecnt == 1) { /* first of two completions: wait for the second */ + spin_unlock(&log->l_icloglock); + return; + } + iclog->ic_state = XLOG_STATE_DONE_SYNC; + } + + /* + * Someone could be sleeping prior to writing out the next + * iclog buffer, we wake them all, one will get to do the + * I/O, the others get to wait for the result. + */ + wake_up_all(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); + xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ +} /* xlog_state_done_syncing */ + + +/* + * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. + * + * The in-core logs are used in a circular fashion. They are not used + * out-of-order even when an iclog past the head is free. + * + * return: + * * log_offset where xlog_write() can start writing into the in-core + * log's data space. + * * in-core log pointer to which xlog_write() should write. + * * boolean indicating this is a continued write to an in-core log. + * If this is the last write, then the in-core log's offset field + * needs to be incremented, depending on the amount of data which + * is copied. 
+ */ +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, /* bytes the caller still wants to write */ + struct xlog_in_core **iclogp, /* out: iclog to write into */ + struct xlog_ticket *ticket, + int *continued_write, /* out: 1 if len does not fit in this iclog */ + int *logoffsetp) /* out: ic_offset of the iclog before this write */ +{ + int log_offset; + xlog_rec_header_t *head; + xlog_in_core_t *iclog; + int error; + +restart: + spin_lock(&log->l_icloglock); + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + iclog = log->l_iclog; + if (iclog->ic_state != XLOG_STATE_ACTIVE) { + XFS_STATS_INC(xs_log_noiclogs); + + /* Wait for log writes to have flushed */ + xlog_wait(&log->l_flush_wait, &log->l_icloglock); + goto restart; + } + + head = &iclog->ic_header; + + atomic_inc(&iclog->ic_refcnt); /* prevents sync: the iclog is only synced once its refcnt drops to zero */ + log_offset = iclog->ic_offset; + + /* On the 1st write to an iclog, figure out lsn. This works + * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are + * committing to. If the offset is set, that's how many blocks + * must be written. + */ + if (log_offset == 0) { + ticket->t_curr_res -= log->l_iclog_hsize; + xlog_tic_add_region(ticket, + log->l_iclog_hsize, + XLOG_REG_TYPE_LRHEADER); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); + ASSERT(log->l_curr_block >= 0); + } + + /* If there is enough room to write everything, then do it. Otherwise, + * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC + * bit is on, so this will get flushed out. Don't update ic_offset + * until you know exactly how many bytes get copied. Therefore, wait + * until later to update ic_offset. + * + * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * can fit into remaining data section. 
+ */ + if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + + /* + * If I'm the only one writing to this iclog, sync it to disk. 
+ * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) + return error; + } else { + spin_unlock(&log->l_icloglock); + } + goto restart; /* retry on whatever iclog is now at the head */ + } + + /* Do we have enough room to write the full amount in the remainder + * of this iclog? Or must we continue a write on the next iclog and + * mark this iclog as completely taken? In the case where we switch + * iclogs (to mark it taken), this particular iclog will release/sync + * to disk in xlog_write(). + */ + if (len <= iclog->ic_size - iclog->ic_offset) { + *continued_write = 0; + iclog->ic_offset += len; + } else { + *continued_write = 1; + xlog_state_switch_iclogs(log, iclog, iclog->ic_size); + } + *iclogp = iclog; + + ASSERT(iclog->ic_offset <= iclog->ic_size); + spin_unlock(&log->l_icloglock); + + *logoffsetp = log_offset; + return 0; +} /* xlog_state_get_iclog_space */ + +/* The first cnt-1 times through here we don't need to + * move the grant write head because the permanent + * reservation has reserved cnt times the unit amount. + * Release part of current permanent unit reservation and + * reset current reservation to be one units worth. Also + * move grant reservation head forward. 
+ */ +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) +{ + trace_xfs_log_regrant_reserve_enter(log, ticket); + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + xlog_grant_sub_space(log, &log->l_reserve_head.grant, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_write_head.grant, + ticket->t_curr_res); + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + + trace_xfs_log_regrant_reserve_sub(log, ticket); + + /* just return if we still have some of the pre-reserved space */ + if (ticket->t_cnt > 0) + return; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + + /* NOTE(review): t_curr_res was already reset above; this repeat looks redundant - confirm */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); +} /* xlog_regrant_reserve_log_space */ + + +/* + * Give back the space left from a reservation. + * + * All the information we need to make a correct determination of space left + * is present. For non-permanent reservations, things are quite easy. The + * count should have been decremented to zero. We only need to deal with the + * space remaining in the current reservation part of the ticket. If the + * ticket contains a permanent reservation, there may be left over space which + * needs to be released. A count of N means that N-1 refills of the current + * reservation can be done before we need to ask for more space. The first + * one goes to fill up the first current reservation. Once we run out of + * space, the count will stay at zero and the only space remaining will be + * in the current reservation field. 
+ */ +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) +{ + int bytes; + + if (ticket->t_cnt > 0) + ticket->t_cnt--; + + trace_xfs_log_ungrant_enter(log, ticket); + trace_xfs_log_ungrant_sub(log, ticket); + + /* + * If this is a permanent reservation ticket, we may be able to free + * up more space based on the remaining count. 
+ */ + bytes = ticket->t_curr_res; + if (ticket->t_cnt > 0) { + ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); + bytes += ticket->t_unit_res*ticket->t_cnt; + } + + xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); + + trace_xfs_log_ungrant_exit(log, ticket); + + xfs_log_space_wake(log->l_mp); +} + +/* + * Flush iclog to disk if this is the last reference to the given iclog and + * the WANT_SYNC bit is set. + * + * When this function is entered, the iclog is not necessarily in the + * WANT_SYNC state. It may be sitting around waiting to get filled. + * + * Returns -EIO if the iclog is in the IOERROR state, the result of + * xlog_sync() if a sync was started, and 0 otherwise. + */ +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) +{ + int sync = 0; /* do we sync? */ + + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; /* not the last reference; nothing more to do */ + + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_WANT_SYNC); + + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); + sync++; + iclog->ic_state = XLOG_STATE_SYNCING; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); + /* cycle incremented when incrementing curr_block */ + } + spin_unlock(&log->l_icloglock); + + /* + * We let the log lock go, so it's possible that we hit a log I/O + * error or some other SHUTDOWN condition that marks the iclog + * as XLOG_STATE_IOERROR before the bwrite. However, we know that + * this iclog has consistent data, so we ignore IOERROR + * flags after this point. 
+ */ + if (sync) + return xlog_sync(log, iclog); + return 0; +} /* xlog_state_release_iclog */ + + +/* + * This routine will mark the current iclog in the ring as WANT_SYNC + * and move the current iclog pointer to the next iclog in the ring. + * When this routine is called from xlog_state_get_iclog_space(), the + * exact size of the iclog has not yet been determined. All we know is + * that we have run out of space in this log record. + */ +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) +{ + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + if (!eventual_size) + eventual_size = iclog->ic_offset; /* 0 means: use what has been written so far */ + iclog->ic_state = XLOG_STATE_WANT_SYNC; + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + log->l_prev_block = log->l_curr_block; + log->l_prev_cycle = log->l_curr_cycle; + + /* roll log?: ic_offset changed later */ + log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); + + /* Round up to next log-sunit */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1) { + __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + log->l_curr_block = roundup(log->l_curr_block, sunit_bb); + } + + if (log->l_curr_block >= log->l_logBBsize) { + log->l_curr_cycle++; + if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) + log->l_curr_cycle++; /* skip the cycle value equal to XLOG_HEADER_MAGIC_NUM */ + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + } + ASSERT(iclog == log->l_iclog); + log->l_iclog = iclog->ic_next; +} /* xlog_state_switch_iclogs */ + +/* + * Write out all data in the in-core log as of this exact moment in time. + * + * Data may be written to the in-core log during this call. However, + * we don't guarantee this data will be written out. A change from past + * implementation means this routine will *not* write out zero length LRs. 
+ * + * Basically, we try and perform an intelligent scan of the in-core logs. 
+ * If we determine there is no flushable data, we just return. There is no + * flushable data if: + * + * 1. the current iclog is active and has no data; the previous iclog + * is in the active or dirty state. + * 2. the current iclog is dirty, and the previous iclog is in the + * active or dirty state. + * + * We may sleep if: + * + * 1. the current iclog is not in the active nor dirty state. + * 2. the current iclog is dirty, and the previous iclog is not in the + * active nor dirty state. + * 3. the current iclog is active, and there is another thread writing + * to this particular iclog. + * 4. a) the current iclog is active and has no other writers + * b) when we return from flushing out this iclog, it is still + * not in the active nor dirty state. + */ +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) +{ + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + xlog_cil_force(log); + + spin_lock(&log->l_icloglock); + + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + /* If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep. + */ + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) { + /* + * If the head is dirty or (active and empty), then + * we need to look at the previous iclog. If the previous + * iclog is active or dirty we are done. There is nothing + * to sync out. Otherwise, we attach ourselves to the + * previous iclog and go to sleep. 
+ */ + if (iclog->ic_state == XLOG_STATE_DIRTY || + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto no_sleep; + else + goto maybe_sleep; + } else { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* We are the only one with access to this + * iclog. Flush it out now. There should + * be a roundoff of zero to show that someone + * has already taken care of the roundoff from + * the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && + iclog->ic_state != XLOG_STATE_DIRTY) + goto maybe_sleep; + else + goto no_sleep; + } else { + /* Someone else is writing to this iclog. + * Use its call to flush out the data. However, + * the other thread may not force out this LR, + * so we mark it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); + goto maybe_sleep; + } + } + } + + /* By the time we come around again, the iclog could've been filled + * which would give it another lsn. If we have a new lsn, just + * return because the relevant data has been flushed. + */ +maybe_sleep: + if (flags & XFS_LOG_SYNC) { + /* + * We must check if we're shutting down here, before + * we wait, while we're holding the l_icloglock. + * Then we check again after waking up, in case our + * sleep was disturbed by bad news. 
+ */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + if (log_flushed) + *log_flushed = 1; + } else { + +no_sleep: + spin_unlock(&log->l_icloglock); + } + return 0; +} + +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, 0); + error = _xfs_log_force(mp, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. 
+ */ +int +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) +{ + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; + + ASSERT(lsn != 0); + + XFS_STATS_INC(xs_log_force); + + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } + + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. 
+ */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } + + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return -EIO; + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); + } + + return 0; + } while (iclog != log->l_iclog); + + /* no iclog in the ring carries this lsn */ + spin_unlock(&log->l_icloglock); + return 0; +} + +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. 
+ */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, lsn); + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} + +/* + * Called when we want to mark the current iclog as being ready to sync to + * disk. 
+ */ +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) +{ + assert_spin_locked(&log->l_icloglock); + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + xlog_state_switch_iclogs(log, iclog, 0); + } else { + ASSERT(iclog->ic_state & + (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); + } +} + + +/***************************************************************************** + * + * TICKET functions + * + ***************************************************************************** + */ + +/* + * Free a used ticket when its refcount falls to zero. + */ +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) + kmem_zone_free(xfs_log_ticket_zone, ticket); +} + +/* Take an additional reference on the ticket and return the same ticket. */ xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) +{ + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} + +/* + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. + */ +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; + + /* + * Permanent reservations have up to 'cnt'-1 active log operations + * in the log. A unit in this case is the amount of space for one + * of these log operations. Normal reservations have a cnt of 1 + * and their unit amount is the total amount of space required. + * + * The following lines of code account for non-transaction data + * which occupy space in the on-disk log. + * + * Normal form of a transaction is: + * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph> + * and then there are LR hdrs, split-recs and roundoff at end of syncs. + * + * We need to account for all the leadup data and trailer data + * around the transaction data. + * And then we need to account for the worst case in terms of using + * more space.
+ * The worst case will happen if: + * - the placement of the transaction happens to be such that the + * roundoff is at its maximum + * - the transaction data is synced before the commit record is synced + * i.e. <transaction-data><roundoff> | <commit-rec><roundoff> + * Therefore the commit record is in its own Log Record. + * This can happen as the commit record is called with its + * own region to xlog_write(). + * This then means that in the worst case, roundoff can happen for + * the commit-rec as well. + * The commit-rec is smaller than padding in this scenario and so it is + * not added separately. + */ + + /* for trans header */ + unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(xfs_trans_header_t); + + /* for start-rec */ + unit_bytes += sizeof(xlog_op_header_t); + + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this.
+ */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } + unit_bytes += log->l_iclog_hsize * num_headers; + + /* for commit-rec LR header - note: padding will subsume the ophdr */ + unit_bytes += log->l_iclog_hsize; + + /* for roundoff padding for transaction data and one for commit record */ + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { + /* log su roundoff */ + unit_bytes += 2 * mp->m_sb.sb_logsunit; + } else { + /* BB roundoff */ + unit_bytes += 2 * BBSIZE; + } + + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + * + * Returns NULL if the zone allocation fails. Otherwise the ticket starts + * with a single reference, and its unit and current reservations are set + * from xfs_log_calc_unit_res() before xlog_tic_reset_res() is called. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + + atomic_set(&tic->t_ref, 1); + tic->t_task = current; + INIT_LIST_HEAD(&tic->t_queue); + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; + tic->t_cnt = cnt; + tic->t_ocnt = cnt; + tic->t_tid = prandom_u32(); + tic->t_clientid = client; + tic->t_flags = XLOG_TIC_INITED; + tic->t_trans_type = 0; + if (permanent) + tic->t_flags |= XLOG_TIC_PERM_RESERV; + + xlog_tic_reset_res(tic); + + return tic; +} + + +/****************************************************************************** + * + * Log debug routines + * + ****************************************************************************** + */ +#if defined(DEBUG) +/* + * Make sure that the destination ptr is within the valid
data region of + * one of the iclogs. This uses backup pointers stored in a different + * part of the log in case we trash the log structure. + */ +void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr) +{ + int i; + int good_ptr = 0; + + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) + good_ptr++; + } + + if (!good_ptr) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); +} + +/* + * Check to make sure the grant write head didn't just overlap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlike + * an assert, it can be turned off at runtime.
+ */ +STATIC void +xlog_verify_grant_tail( + struct xlog *log) +{ + int tail_cycle, tail_blocks; + int cycle, space; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + } +} + +/* + * Check if it will fit: work out how many basic blocks lie between the log's + * previous block and the given tail LSN, and complain through xfs_emerg() if + * the iclog being written needs more than that. + */ +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) +{ + int blocks; + + if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { + blocks = + log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); + if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } else { + ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); + + if (BLOCK_LSN(tail_lsn) == log->l_prev_block) + xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); + + blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; + if (blocks < BTOBB(iclog->ic_offset) + 1) + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); + } +} /* xlog_verify_tail_lsn */ + +/* + * Perform a number of checks on the iclog before writing to disk. + * + * 1. Make sure the iclogs are still circular + * 2. Make sure we have a good magic number + * 3. Make sure we don't have magic numbers in the data + * 4. Check fields of each log operation header for: + * A. Valid client identifier + * B. tid ptr value falls in valid ptr space (user space code) + * C. Length in log record header is correct according to the + * individual operation headers within record. + * 5.
When a bwrite will occur within 5 blocks of the front of the physical + * log, check the preceding blocks of the physical log to make sure all + * the cycle numbers agree with the current cycle number. + */ +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing) +{ + xlog_op_header_t *ophead; + xlog_in_core_t *icptr; + xlog_in_core_2_t *xhdr; + xfs_caddr_t ptr; + xfs_caddr_t base_ptr; + __psint_t field_offset; + __uint8_t clientid; + int len, i, j, k, op_len; + int idx; + + /* check validity of iclog pointers */ + spin_lock(&log->l_icloglock); + icptr = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + + if (icptr != log->l_iclog) + xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); + spin_unlock(&log->l_icloglock); + + /* check log magic numbers */ + if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); + + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; + ptr += BBSIZE) { + if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: unexpected magic num", + __func__); + } + + /* check fields */ + len = be32_to_cpu(iclog->ic_header.h_num_logops); + ptr = iclog->ic_datap; + base_ptr = ptr; + ophead = (xlog_op_header_t *)ptr; + xhdr = iclog->ic_data; + for (i = 0; i < len; i++) { + ophead = (xlog_op_header_t *)ptr; + + /* + * clientid is only 1 byte. When syncing, a field that starts on a + * 512-byte boundary (offset & 0x1ff == 0) is read from the cycle data + * kept in the record header(s) instead of from the data buffer. + */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + if (!syncing || (field_offset & 0x1ff)) { + clientid = ophead->oh_clientid; + } else { + idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + clientid = xlog_get_client_id( +
xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); + } + } + if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) + xfs_warn(log->l_mp, + "%s: invalid clientid %d op 0x%p offset 0x%lx", + __func__, clientid, ophead, + (unsigned long)field_offset); + + /* check length: same header-cycle-data lookup as for clientid above */ + field_offset = (__psint_t) + ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + if (!syncing || (field_offset & 0x1ff)) { + op_len = be32_to_cpu(ophead->oh_len); + } else { + idx = BTOBBT((__psint_t)&ophead->oh_len - + (__psint_t)iclog->ic_datap); + if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { + j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); + } else { + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); + } + } + ptr += sizeof(xlog_op_header_t) + op_len; + } +} /* xlog_verify_iclog */ +#endif + +/* + * Mark all iclogs IOERROR. l_icloglock is held by the caller. + * Returns 0 if this call made the transition, 1 if it had already happened. + */ +STATIC int +xlog_state_ioerror( + struct xlog *log) +{ + xlog_in_core_t *iclog, *ic; + + iclog = log->l_iclog; + if (! (iclog->ic_state & XLOG_STATE_IOERROR)) { + /* + * Mark all the incore logs IOERROR. + * From now on, no log flushes will result. + */ + ic = iclog; + do { + ic->ic_state = XLOG_STATE_IOERROR; + ic = ic->ic_next; + } while (ic != iclog); + return 0; + } + /* + * Return non-zero, if state transition has already happened. + */ + return 1; +} + +/* + * This is called from xfs_force_shutdown, when we're forcibly + * shutting down the filesystem, typically because of an IO error. + * Our main objectives here are to make sure that: + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested + * parties to find out, 'atomically'. + * c.
those who're sleeping on log reservations, pinned objects and + * other resources get woken up, and be told the bad news. + * d. nothing new gets queued up after (b) and (c) are done. + * + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. + */ +int +xfs_log_force_umount( + struct xfs_mount *mp, + int logerror) +{ + struct xlog *log; + int retval; + + log = mp->m_log; + + /* + * If this happens during log recovery, don't worry about + * locking; the log isn't open for business yet. + */ + if (!log || + log->l_flags & XLOG_ACTIVE_RECOVERY) { + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + return 0; + } + + /* + * Somebody could've already done the hard work for us. + * No need to get locks for this. + */ + if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) { + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + return 1; + } + + /* + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. + */ + if (!logerror) + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + + /* + * Mark the filesystem and the log as being in a shutdown state and + * wake everybody up to tell them the bad news. + */ + spin_lock(&log->l_icloglock); + mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + + /* + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. + */ + log->l_flags |= XLOG_IO_ERROR; + retval = xlog_state_ioerror(log); + spin_unlock(&log->l_icloglock); + + /* + * We don't want anybody waiting for log reservations after this.
That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. + */ + xlog_grant_head_wake_all(&log->l_reserve_head); + xlog_grant_head_wake_all(&log->l_write_head); + + /* + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. + */ + wake_up_all(&log->l_cilp->xc_commit_wait); + xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + +#ifdef XFSERRORDEBUG + { + xlog_in_core_t *iclog; + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + ASSERT(iclog->ic_callback == 0); + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + spin_unlock(&log->l_icloglock); + } +#endif + /* return non-zero if log IOERROR transition had already happened */ + return retval; +} + +/* Walk the iclog ring: return 0 as soon as an iclog header records any log operations, 1 if none does. */ STATIC int +xlog_iclogs_empty( + struct xlog *log) +{ + xlog_in_core_t *iclog; + + iclog = log->l_iclog; + do { + /* endianness does not matter here, zero is zero in + * any language. + */ + if (iclog->ic_header.h_num_logops) + return 0; + iclog = iclog->ic_next; + } while (iclog != log->l_iclog); + return 1; +} + diff --git a/kernel/fs/xfs/xfs_log.h b/kernel/fs/xfs/xfs_log.h new file mode 100644 index 000000000..84e0deb95 --- /dev/null +++ b/kernel/fs/xfs/xfs_log.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_H__ +#define __XFS_LOG_H__ + +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ + int lv_size; /* size of allocated lv */ +}; + +#define XFS_LOG_VEC_ORDERED (-1) + +/* Select the next iovec in lv (the first one when *vecp is NULL), set its type, point it at the current end of the formatted buffer, store it back through vecp and return that buffer address. */ static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket.
+ */ +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; + vec->i_len = len; +} + +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + +/* + * Structure used to pass callback function and the function's argument + * to the log manager. + */ +typedef struct xfs_log_callback { + struct xfs_log_callback *cb_next; + void (*cb_func)(void *, int); + void *cb_arg; +} xfs_log_callback_t; + +/* + * By comparing each component, we don't have to worry about extra + * endian issues in treating two 32 bit numbers as one 64 bit number + */ +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +{ + if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) + return (CYCLE_LSN(lsn1)t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct xlog *log) +{ + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; +} + +/* + * Prepare the log item for insertion into the CIL. 
Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct xlog *log, + struct xfs_log_vec *lv, + struct xfs_log_vec *old_lv, + int *diff_len, + int *diff_iovecs) +{ + /* Account for the new LV being passed in */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *diff_len += lv->lv_bytes; + *diff_iovecs += lv->lv_niovecs; + } + + /* + * If there is no old LV, this is the first time we've seen the item in + * this CIL context and so we need to pin it. If we are replacing the + * old_lv, then remove the space it accounts for and free it. + */ + if (!old_lv) + lv->lv_item->li_ops->iop_pin(lv->lv_item); + else if (old_lv != lv) { + ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + + *diff_len -= old_lv->lv_bytes; + *diff_iovecs -= old_lv->lv_niovecs; + kmem_free(old_lv); + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Format log items into flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item, and the items are then inserted + * into the Committed Item List for tracking until the next checkpoint is + * written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundaries without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_insert_format_items( + struct xlog *log, + struct xfs_trans *tp, + int *diff_len, + int *diff_iovecs) +{ + struct xfs_log_item_desc *lidp; + + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + struct xfs_log_vec *old_lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* Skip items that do not have any vectors for writing */ + if (!niovecs) + continue; + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data.
+ */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* grab the old item if it exists for reservation accounting */ + old_lv = lip->li_lv; + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); + + /* compare to existing item size */ + if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv; + lv->lv_next = NULL; + + if (ordered) + goto insert; + + /* + * set the item up as though it is a new insertion so + * that the space reservation accounting is correct.
+ */ + *diff_iovecs -= lv->lv_niovecs; + *diff_len -= lv->lv_bytes; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); + + lip->li_ops->iop_format(lip, lv); +insert: + ASSERT(lv->lv_buf_len <= nbytes); + xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + } +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we added to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ +static void +xlog_cil_insert_items( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_item_desc *lidp; + int len = 0; + int diff_iovecs = 0; + int iclog_space; + + ASSERT(tp); + + /* + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. + */ + xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit.
+ */ + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + ctx->nvecs += diff_iovecs; + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + tp->t_ticket->t_curr_res -= hdrs; + ASSERT(tp->t_ticket->t_curr_res >= len); + } + tp->t_ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +/* Free every log vector on the lv_next chain that starts at log_vector. */ static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv); + lv = next; + } +} + +/* + * Mark all items committed and clear busy extents.
We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); + + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then on a shutdown we can leave + * processes sleeping in xlog_cil_force_lsn(), waiting on a sequence + * commit that will never happen because we aborted it. + */ + spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_push_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); + } + + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes.
+ */ +STATIC int +xlog_cil_push( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_iovecs; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + down_write(&cil->xc_ctx_lock); + ctx = cil->xc_ctx; + + spin_lock(&cil->xc_push_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); + + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_push_lock); + goto out_skip; + } + + + /* check for a previously pushed sequence */ + if (push_seq < cil->xc_ctx->sequence) { + spin_unlock(&cil->xc_push_lock); + goto out_skip; + } + + /* + * We are now going to push this context, so add it to the committing + * list before we do anything else. This ensures that anyone waiting on + * this push can easily detect the difference between a "push in + * progress" and "CIL is empty, nothing to do". + * + * IOWs, a wait loop can now check for: + * the current sequence not being found on the committing list; + * an empty CIL; and + * an unchanged sequence number + * to detect a push that had nothing to do and therefore does not need + * waiting on. If the CIL is not empty, we get put on the committing + * list before emptying the CIL and bumping the sequence number. Hence + * an empty CIL and an unchanged sequence number means we jumped out + * above after doing nothing.
+ * + * Hence the waiter will either find the commit sequence on the + * committing list or the sequence number will be unchanged and the CIL + * still dirty. In that latter case, the push has not yet started, and + * so the waiter will have to continue trying to check the CIL + * committing list until it is found. In extreme cases of delay, the + * sequence may fully commit between the attempts the wait makes to wait + * on the commit sequence. + */ + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_push_lock); + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_iovecs = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + num_iovecs += lv->lv_niovecs; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing list so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order.
+ * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * dereferencing a freed context pointer. + */ + spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; + spin_unlock(&cil->xc_push_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head.
+ */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = &thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort_free_ticket; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_push_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for our own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + } + spin_unlock(&cil->xc_push_lock); + + /* xfs_log_done always frees the ticket on error. 
*/ + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_push_lock); + ctx->commit_lsn = commit_lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_push_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort_free_ticket: + xfs_log_ticket_put(tic); +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return -EIO; +} + +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. 
+ */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_push_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_push_lock); + +} + +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ +static void +xlog_cil_push_now( + struct xlog *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_push_lock); + return; + } + + cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + spin_unlock(&cil->xc_push_lock); +} + +bool +xlog_cil_empty( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + bool empty = false; + + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil)) + empty = true; + spin_unlock(&cil->xc_push_lock); + return empty; +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. 
Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +void +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; + int log_flags = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + /* lock out background commit */ + down_read(&cil->xc_ctx_lock); + + xlog_cil_insert_items(log, tp); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(mp, tp->t_ticket); + + tp->t_commit_lsn = cil->xc_ctx->sequence; + if (commit_lsn) + *commit_lsn = tp->t_commit_lsn; + + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + + xlog_cil_push_background(log); + + up_read(&cil->xc_ctx_lock); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. 
Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ +restart: + xlog_cil_push_now(log, sequence); + + /* + * See if we can find a previous sequence still committing. + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_push_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; + if (ctx->sequence > sequence) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + if (ctx->sequence != sequence) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects.
This is guaranteed by the push code first adding the + * context to the committing list before emptying the CIL. + * + * Hence if we don't find the context in the committing list and the + * current sequence number is unchanged then the CIL contents are + * significant. If the CIL is empty, it means there was nothing to push + * and that means there is nothing to wait for. If the CIL is not empty, + * it means we haven't yet started the push, because if it had started + * we would have found the context on the committing list. + */ + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + + spin_unlock(&cil->xc_push_lock); + return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through its iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint.
+ */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct xlog *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return -ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return -ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + spin_lock_init(&cil->xc_push_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct xlog *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/kernel/fs/xfs/xfs_log_priv.h b/kernel/fs/xfs/xfs_log_priv.h new file mode 100644 index 000000000..db7cbdeb2 --- /dev/null +++ b/kernel/fs/xfs/xfs_log_priv.h @@ -0,0 +1,561 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_PRIV_H__ +#define __XFS_LOG_PRIV_H__ + +struct xfs_buf; +struct xlog; +struct xlog_ticket; +struct xfs_mount; +struct xfs_log_callback; + +/* + * Flags for log structure + */ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ + +/* + * get client id from packed copy. + * + * this hack is here because the xlog_pack code copies four bytes + * of xlog_op_header containing the fields oh_clientid, oh_flags + * and oh_res2 into the packed copy. + * + * later on this four byte chunk is treated as an int and the + * client id is pulled out. + * + * this has endian issues, of course. 
+ */ +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} + +/* + * In core log state + */ +#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ +#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ +#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ +#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ +#define XLOG_STATE_DO_CALLBACK \ + 0x0010 /* Process callback functions */ +#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ +#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ +#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ +#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ + +/* + * Flags to log ticket + */ +#define XLOG_TIC_INITED 0x1 /* has been initialized */ +#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ + +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } + +/* + * Below are states for covering allocation transactions. + * By covering, we mean changing the h_tail_lsn in the last on-disk + * log write such that no allocation transactions will be re-done during + * recovery after a system crash. Recovery starts at the last on-disk + * log write. + * + * These states are used to insert dummy log entries to cover + * space allocation transactions which can undo non-transactional changes + * after a crash. Writes to a file with space + * already allocated do not result in any transactions. Allocations + * might include space beyond the EOF. So if we just push the EOF a + * little, the last transaction for the file could contain the wrong + * size. If there is no file system activity, after an allocation + * transaction, and the system crashes, the allocation transaction + * will get replayed and the file will be truncated. 
This could + * be hours/days/... after the allocation occurred. + * + * The fix for this is to do two dummy transactions when the + * system is idle. We need two dummy transaction because the h_tail_lsn + * in the log record header needs to point beyond the last possible + * non-dummy transaction. The first dummy changes the h_tail_lsn to + * the first transaction before the dummy. The second dummy causes + * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. + * + * These dummy transactions get committed when everything + * is idle (after there has been some activity). + * + * There are 5 states used to control this. + * + * IDLE -- no logging has been done on the file system or + * we are done covering previous transactions. + * NEED -- logging has occurred and we need a dummy transaction + * when the log becomes idle. + * DONE -- we were in the NEED state and have committed a dummy + * transaction. + * NEED2 -- we detected that a dummy transaction has gone to the + * on disk log with no other transactions. + * DONE2 -- we committed a dummy transaction when in the NEED2 state. + * + * There are two places where we switch states: + * + * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. + * We commit the dummy transaction and switch to DONE or DONE2, + * respectively. In all other states, we don't do anything. + * + * 2.) When we finish writing the on-disk log (xlog_state_clean_log). + * + * No matter what state we are in, if this isn't the dummy + * transaction going out, the next state is NEED. + * So, if we aren't in the DONE or DONE2 states, the next state + * is NEED. We can't be finishing a write of the dummy record + * unless it was committed and the state switched to DONE or DONE2. + * + * If we are in the DONE state and this was a write of the + * dummy transaction, we move to NEED2. + * + * If we are in the DONE2 state and this was a write of the + * dummy transaction, we move to IDLE. 
+ * + * + * Writing only one dummy transaction can get appended to + * one file space allocation. When this happens, the log recovery + * code replays the space allocation and a file could be truncated. + * This is why we have the NEED2 and DONE2 states before going idle. + */ + +#define XLOG_STATE_COVER_IDLE 0 +#define XLOG_STATE_COVER_NEED 1 +#define XLOG_STATE_COVER_DONE 2 +#define XLOG_STATE_COVER_NEED2 3 +#define XLOG_STATE_COVER_DONE2 4 + +#define XLOG_COVER_OPS 5 + +/* Ticket reservation region accounting */ +#define XLOG_TIC_LEN_MAX 15 + +/* + * Reservation region + * As would be stored in xfs_log_iovec but without the i_addr which + * we don't care about. + */ +typedef struct xlog_res { + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ +} xlog_res_t; + +typedef struct xlog_ticket { + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ + int t_curr_res; /* current reservation in bytes : 4 */ + int t_unit_res; /* unit reservation in bytes : 4 */ + char t_ocnt; /* original count : 1 */ + char t_cnt; /* current count : 1 */ + char t_clientid; /* who does this belong to; : 1 */ + char t_flags; /* properties of reservation : 1 */ + uint t_trans_type; /* transaction type : 4 */ + + /* reservation array fields */ + uint t_res_num; /* num in array : 4 */ + uint t_res_num_ophdrs; /* num op hdrs : 4 */ + uint t_res_arr_sum; /* array sum : 4 */ + uint t_res_o_flow; /* sum overflow : 4 */ + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ +} xlog_ticket_t; + +/* + * - A log record header is 512 bytes. There is plenty of room to grow the + * xlog_rec_header_t into the reserved space. + * - ic_data follows, so a write to disk can start at the beginning of + * the iclog. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. 
+ * - ic_next is the pointer to the next iclog in the ring. + * - ic_bp is a pointer to the buffer used to write this incore log to disk. + * - ic_log is a pointer back to the global log structure. + * - ic_callback is a linked list of callback function/argument pairs to be + * called after an iclog finishes writing. + * - ic_size is the full size of the header plus data. + * - ic_offset is the current number of bytes written to in this iclog. + * - ic_refcnt is bumped when someone is writing to the log. + * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. + */ +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; + struct xlog_in_core *ic_next; + struct xlog_in_core *ic_prev; + struct xfs_buf *ic_bp; + struct xlog *ic_log; + int ic_size; + int ic_offset; + int ic_bwritecnt; + unsigned short ic_state; + char *ic_datap; /* pointer to iclog data */ + + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; + + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header +} xlog_in_core_t; + +/* + * The CIL context is used to aggregate per-transaction details as well as being + * passed to the iclog for checkpoint post-commit processing.
After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_force_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods.
+ */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes.
Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachelines + * as these are quite hot and can be operated on concurrently. + */ +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; + +/* + * The reservation head lsn is not made up of a cycle number and block number. + * Instead, it uses a cycle number and byte number.
Logs don't expect to + * overflow 31 bits worth of byte offset, so using a byte number will mean + * that round off problems won't occur when releasing partial reservations. + */ +struct xlog { + /* The following fields don't need locking */ + struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ + struct xfs_buf *l_xbuf; /* extra buffer for log + * wrapping */ + struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ + xfs_daddr_t l_logBBstart; /* start block of log */ + int l_logsize; /* size of log in bytes */ + int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ + int l_curr_cycle; /* Cycle number of log writes */ + int l_prev_cycle; /* Cycle number before last + * block increment */ + int l_curr_block; /* current logical log block */ + int l_prev_block; /* previous logical log block */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. 
+ */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; + + struct xfs_kobj l_kobj; + + /* The following field is used for debugging; need to hold icloglock */ +#ifdef DEBUG + char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif + +}; + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + +#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) + +/* common routines */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables.
+ */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); /* single read; both parts come from this one sample */ + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; /* upper 32 bits */ + *space = val & 0xffffffff; /* lower 32 bits */ +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; /* NOTE(review): a negative space would sign-extend over the cycle bits; callers presumably pass space >= 0 - confirm */ +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); /* uses the CIL's current sequence; the returned lsn is ignored */ +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * Its value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock.
This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); /* queue ourselves before the lock is dropped */ + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); /* the lock is released here and is not re-taken before return */ + schedule(); + remove_wait_queue(wq, &wait); +} + +#endif /* __XFS_LOG_PRIV_H__ */ diff --git a/kernel/fs/xfs/xfs_log_recover.c b/kernel/fs/xfs/xfs_log_recover.c new file mode 100644 index 000000000..4f5784f85 --- /dev/null +++ b/kernel/fs/xfs/xfs_log_recover.c @@ -0,0 +1,4651 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" +#include "xfs_extfree_item.h" +#include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_error.h" +#include "xfs_dir2.h" + +#define BLK_AVG(blk1, blk2) (((blk1) + (blk2)) >> 1) /* midpoint of two block numbers; arguments parenthesized so expressions expand safely */ + +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); +#if defined(DEBUG) +STATIC void +xlog_recover_check_summary( + struct xlog *); +#else +#define xlog_recover_check_summary(log) +#endif + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* + * Sector aligned buffer routines for buffer create/read/write/access + */ + +/* + * Verify the given count of basic blocks is a valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid (0 < bbcount <= l_logBBsize), 0 otherwise. + */ + +static inline int +xlog_buf_bbcount_valid( + struct xlog *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data.
The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ +STATIC xfs_buf_t * +xlog_get_bp( + struct xlog *log, + int nbblks) +{ + struct xfs_buf *bp; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; /* invalid count: warn, report, and allocate nothing */ + } + + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); + if (bp) + xfs_buf_unlock(bp); /* a successfully allocated buffer is handed back unlocked */ + return bp; /* may be NULL if the allocation above failed */ +} + +STATIC void /* frees bp via xfs_buf_free() */ +xlog_put_bp( + xfs_buf_t *bp) +{ + xfs_buf_free(bp); +} + +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ +STATIC xfs_caddr_t +xlog_align( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); /* block offset of blk_no inside its log sector; the mask relies on l_sectBBsize being a power of 2 */ + + ASSERT(offset + nbblks <= bp->b_length); + return bp->b_addr + BBTOB(offset); +} + + +/* + * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
+ */ +STATIC int +xlog_bread_noalign( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return -EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); /* align the start down and the length up to whole log sectors */ + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(nbblks <= bp->b_length); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); /* blk_no is relative to the start of the log */ + XFS_BUF_READ(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_buf_submit_wait(bp); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) + xfs_buf_ioerror_alert(bp, __func__); /* no alert once the mount is already shut down */ + return error; +} + +STATIC int /* read via xlog_bread_noalign(), then return the address of blk_no's data in *offset */ +xlog_bread( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; /* *offset is left untouched on failure */ + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + +/* + * Read at an offset into the buffer. Returns with the buffer in its original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + struct xlog *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + struct xfs_buf *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = bp->b_addr; + int orig_len = BBTOB(bp->b_length); + int error, error2; + + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); + if (error) + return error; /* the read error takes precedence over a restore failure */ + return error2; +} + +/* + * Write out the buffer at the given block for the given number of blocks. + * The buffer is kept locked across the write and is returned locked.
+ * This can only be used for synchronous log writes. + */ +STATIC int +xlog_bwrite( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + int error; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return -EFSCORRUPTED; + } + + blk_no = round_down(blk_no, log->l_sectBBsize); /* align the start down and the length up to whole log sectors */ + nbblks = round_up(nbblks, log->l_sectBBsize); + + ASSERT(nbblks > 0); + ASSERT(nbblks <= bp->b_length); + + XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); + XFS_BUF_ZEROFLAGS(bp); + xfs_buf_hold(bp); + xfs_buf_lock(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); /* NOTE(review): header comment says bp is returned locked, yet it is released here after the hold/lock above - confirm */ + return error; +} + +#ifdef DEBUG +/* + * dump debug superblock and log record information + */ +STATIC void +xlog_header_check_dump( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + xfs_debug(mp, " log : uuid = %pU, fmt = %d", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); +} +#else +#define xlog_header_check_dump(mp, head) +#endif + +/* + * check log record header for recovery: the format must be XLOG_FMT and the uuid must match the superblock + */ +STATIC int +xlog_header_check_recover( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); + + /* + * IRIX doesn't write the h_fmt field and leaves it zeroed + * (XLOG_FMT_UNKNOWN). This stops us from trying to recover + * a dirty log created in IRIX.
+ */ + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(1)", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_recover(2)", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } + return 0; +} + +/* + * check the uuid of a log record header at mount time; nothing is read here, the caller supplies the header + */ +STATIC int +xlog_header_check_mount( + xfs_mount_t *mp, + xlog_rec_header_t *head) +{ + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); + + if (uuid_is_nil(&head->h_fs_uuid)) { + /* + * IRIX doesn't write the h_fs_uuid or h_fmt fields. If + * h_fs_uuid is nil, we assume this log was last mounted + * by IRIX and continue. + */ + xfs_warn(mp, "nil uuid in log - IRIX style log"); + } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { + xfs_warn(mp, "log has mismatched uuid - can't recover"); + xlog_header_check_dump(mp, head); + XFS_ERROR_REPORT("xlog_header_check_mount", + XFS_ERRLEVEL_HIGH, mp); + return -EFSCORRUPTED; + } + return 0; +} + +STATIC void /* on b_error, alert and force a shutdown (unless already shut down), then finish the I/O */ +xlog_recover_iodone( + struct xfs_buf *bp) +{ + if (bp->b_error) { + /* + * We're not going to bother about retrying + * this during recovery. One strike! + */ + if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + } + bp->b_iodone = NULL; /* cleared before calling xfs_buf_ioend() */ + xfs_buf_ioend(bp); +} + +/* + * This routine finds (to an approximation) the first block in the physical + * log which contains the given cycle. It uses a binary search algorithm. + * Note that the algorithm can not be perfect because the disk will not + * necessarily be perfect.
+ */ +STATIC int +xlog_find_cycle_start( + struct xlog *log, + struct xfs_buf *bp, + xfs_daddr_t first_blk, + xfs_daddr_t *last_blk, + uint cycle) +{ + xfs_caddr_t offset; + xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; + uint mid_cycle; + int error; + + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) + return error; /* *last_blk is left unchanged on a read error */ + mid_cycle = xlog_get_cycle(offset); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); + } + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; /* end_blk only ever moves to a block whose cycle matched */ + + return 0; +} + +/* + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. + */ +STATIC int +xlog_find_verify_cycle( + struct xlog *log, + xfs_daddr_t start_blk, + int nbblks, + uint stop_on_cycle_no, + xfs_daddr_t *new_blk) +{ + xfs_daddr_t i, j; + uint cycle; + xfs_buf_t *bp; + xfs_daddr_t bufblks; + xfs_caddr_t buf = NULL; + int error = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. NOTE(review): ffs() selects the lowest set bit of nbblks, so bufblks can be smaller than nbblks; the loop below then reads the range in bufblks-sized chunks - confirm this is intended.
+ */ + bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < log->l_sectBBsize) + return -ENOMEM; /* *new_blk is not written on this path */ + } + + for (i = start_blk; i < start_blk + nbblks; i += bufblks) { + int bcount; + + bcount = min(bufblks, (start_blk + nbblks - i)); /* the last chunk may be short */ + + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) + goto out; + + for (j = 0; j < bcount; j++) { + cycle = xlog_get_cycle(buf); + if (cycle == stop_on_cycle_no) { + *new_blk = i+j; /* first match wins; error is 0 here */ + goto out; + } + + buf += BBSIZE; + } + } + + *new_blk = -1; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Potentially backup over partial log record write. + * + * In the typical case, last_blk is the number of the block directly after + * a good log record. Therefore, we subtract one to get the block number + * of the last block in the given buffer. extra_bblks contains the number + * of blocks we would have read on a previous read. This happens when the + * last log record is split over the end of the physical log. + * + * extra_bblks is the number of blocks potentially verified on a previous + * call to this routine.
+ */ +STATIC int +xlog_find_verify_log_record( + struct xlog *log, + xfs_daddr_t start_blk, + xfs_daddr_t *last_blk, + int extra_bblks) +{ + xfs_daddr_t i; + xfs_buf_t *bp; + xfs_caddr_t offset = NULL; + xlog_rec_header_t *head = NULL; + int error = 0; + int smallmem = 0; + int num_blks = *last_blk - start_blk; + int xhdrs; + + ASSERT(start_blk != 0 || *last_blk != start_blk); + + if (!(bp = xlog_get_bp(log, num_blks))) { + if (!(bp = xlog_get_bp(log, 1))) + return -ENOMEM; + smallmem = 1; + } else { + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) + goto out; + offset += ((num_blks - 1) << BBSHIFT); + } + + for (i = (*last_blk) - 1; i >= 0; i--) { + if (i < start_blk) { + /* valid log record not found */ + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); + ASSERT(0); + error = -EIO; + goto out; + } + + if (smallmem) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out; + } + + head = (xlog_rec_header_t *)offset; + + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + break; + + if (!smallmem) + offset -= BBSIZE; + } + + /* + * We hit the beginning of the physical log & still no header. Return + * to caller. If caller can handle a return of 1, then this routine + * will be called again for the end of the physical log. + */ + if (i == -1) { + error = 1; + goto out; + } + + /* + * We have the final block of the good log (the first block + * of the log record _before_ the head). So we check the uuid. + */ + if ((error = xlog_header_check_mount(log->l_mp, head))) + goto out; + + /* + * We may have found a log record header before we expected one. + * last_blk will be the 1st block # with a given cycle #. We may end + * up reading an entire log record. In this case, we don't want to + * reset last_blk. Only when last_blk points in the middle of a log + * record do we update last_blk.
+ */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); + + xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + xhdrs++; + } else { + xhdrs = 1; + } + + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) + *last_blk = i; + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Head is defined to be the point of the log where the next log write + * could go. This means that incomplete LR writes at the end are + * eliminated when calculating the head. We aren't guaranteed that previous + * LR have complete transactions. We only know that a cycle number of + * current cycle number -1 won't be present in the log if we start writing + * from our current block number. + * + * last_blk contains the block number of the first block with a given + * cycle number. + * + * Return: zero if normal, non-zero if error. + */ +STATIC int +xlog_find_head( + struct xlog *log, + xfs_daddr_t *return_head_blk) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; + int num_scan_bblks; + uint first_half_cycle, last_half_cycle; + uint stop_on_cycle; + int error, log_bbnum = log->l_logBBsize; + + /* Is the end of the log device zeroed? */ + error = xlog_find_zeroed(log, &first_blk); + if (error < 0) { + xfs_warn(log->l_mp, "empty log check failed"); + return error; + } + if (error == 1) { + *return_head_blk = first_blk; + + /* Is the whole lot zeroed?
*/ + if (!first_blk) { + /* Linux XFS shouldn't generate totally zeroed logs - + * mkfs etc write a dummy unmount record to a fresh + * log so we can store the uuid in there + */ + xfs_warn(log->l_mp, "totally zeroed log"); + } + + return 0; + } + + first_blk = 0; /* get cycle # of 1st block */ + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_half_cycle = xlog_get_cycle(offset); + + last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) + goto bp_err; + + last_half_cycle = xlog_get_cycle(offset); + ASSERT(last_half_cycle != 0); + + /* + * If the 1st half cycle number is equal to the last half cycle number, + * then the entire log is stamped with the same cycle number. In this + * case, head_blk can't be set to zero (which makes sense). The below + * math doesn't work out properly with head_blk equal to zero. Instead, + * we set it to log_bbnum which is an invalid block number, but this + * value makes the math correct. If head_blk doesn't change through + * all the tests below, *return_head_blk is set to zero at the very end rather + * than log_bbnum. In a sense, log_bbnum and zero are the same block + * in a circular file. + */ + if (first_half_cycle == last_half_cycle) { + /* + * In this case we believe that the entire log should have + * cycle number last_half_cycle. We need to scan backwards + * from the end verifying that there are no holes still + * containing last_half_cycle - 1. If we find such a hole, + * then the start of that hole will be the new head. The + * simple case looks like + * x | x ... | x - 1 | x + * Another case that fits this picture would be + * x | x + 1 | x ... | x + * In this case the head really is somewhere at the end of the + * log, as one of the latest writes at the beginning was + * incomplete. + * One more case is + * x | x + 1 | x ...
| x - 1 | x + * This is really the combination of the above two cases, and + * the head has to end up at the start of the x-1 hole at the + * end of the log. + * + * In the 256k log case, we will read from the beginning to the + * end of the log and search for cycle numbers equal to x-1. + * We don't worry about the x+1 blocks that we encounter, + * because we know that they cannot be the head since the log + * started with x. + */ + head_blk = log_bbnum; + stop_on_cycle = last_half_cycle - 1; + } else { + /* + * In this case we want to find the first block with cycle + * number matching last_half_cycle. We expect the log to be + * some variation on + * x + 1 ... | x ... | x + * The first block with cycle number x (last_half_cycle) will + * be where the new head belongs. First we do a binary search + * for the first occurrence of last_half_cycle. The binary + * search may not be totally accurate, so then we scan back + * from there looking for occurrences of last_half_cycle before + * us. If that backwards scan wraps around the beginning of + * the log, then we look for occurrences of last_half_cycle - 1 + * at the end of the log. The cases we're looking for look + * like + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot + * or + * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot + */ + stop_on_cycle = last_half_cycle; + if ((error = xlog_find_cycle_start(log, bp, first_blk, + &head_blk, last_half_cycle))) + goto bp_err; + } + + /* + * Now validate the answer. Scan back some number of maximum possible + * blocks and make sure each one has the expected cycle number. The + * maximum is determined by the total possible amount of buffering + * in the in-core log. The following number can be made tighter if + * we actually look at the block size of the filesystem.
+ */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + /* + * We are guaranteed that the entire check can be performed + * in one buffer. + */ + start_blk = head_blk - num_scan_bblks; + if ((error = xlog_find_verify_cycle(log, + start_blk, num_scan_bblks, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } else { /* need to read 2 parts of log */ + /* + * We are going to scan backwards in the log in two parts. + * First we scan the physical end of the log. In this part + * of the log, we are looking for blocks with cycle number + * last_half_cycle - 1. + * If we find one, then we know that the log starts there, as + * we've found a hole that didn't get written in going around + * the end of the physical log. The simple case for this is + * x + 1 ... | x ... | x - 1 | x + * <---------> less than scan distance + * If all of the blocks at the end of the log have cycle number + * last_half_cycle, then we check the blocks at the start of + * the log looking for occurrences of last_half_cycle. If we + * find one, then our current estimate for the location of the + * first occurrence of last_half_cycle is wrong and we move + * back to the hole we've found. This case looks like + * x + 1 ... | x | x + 1 | x ... + * ^ binary search stopped here + * Another case we need to handle that only occurs in 256k + * logs is + * x + 1 ... | x ... | x+1 | x ... + * ^ binary search stops here + * In a 256k log, the scan at the end of the log will see the + * x + 1 blocks. We need to skip past those since that is + * certainly not the head of the log. By searching for + * last_half_cycle-1 we accomplish that.
+ */ + ASSERT(head_blk <= INT_MAX && + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); + if ((error = xlog_find_verify_cycle(log, start_blk, + num_scan_bblks - (int)head_blk, + (stop_on_cycle - 1), &new_blk))) + goto bp_err; + if (new_blk != -1) { + head_blk = new_blk; + goto validate_head; + } + + /* + * Scan beginning of log now. The last part of the physical + * log is good. This scan needs to verify that it doesn't find + * the last_half_cycle. + */ + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + if ((error = xlog_find_verify_cycle(log, + start_blk, (int)head_blk, + stop_on_cycle, &new_blk))) + goto bp_err; + if (new_blk != -1) + head_blk = new_blk; + } + +validate_head: + /* + * Now we need to make sure head_blk is not pointing to a block in + * the middle of a log record. + */ + num_scan_bblks = XLOG_REC_SHIFT(log); + if (head_blk >= num_scan_bblks) { + start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ + + /* start ptr at last block ptr before head_blk */ + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + } else { + start_blk = 0; + ASSERT(head_blk <= INT_MAX); + error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); + if (error < 0) + goto bp_err; + if (error == 1) { + /* We hit the beginning of the log during our search */ + start_blk = log_bbnum - (num_scan_bblks - head_blk); + new_blk = log_bbnum; + ASSERT(start_blk <= INT_MAX && + (xfs_daddr_t) log_bbnum-start_blk >= 0); + ASSERT(head_blk <= INT_MAX); + error = xlog_find_verify_log_record(log, start_blk, + &new_blk, (int)head_blk); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + if (new_blk != log_bbnum) + head_blk = new_blk; + } else if (error) + goto bp_err; + } + + xlog_put_bp(bp); + if (head_blk == log_bbnum) + *return_head_blk = 0; + else + *return_head_blk = head_blk; + /* + * When returning here, we have a good block
number. Bad block + * means that during a previous crash, we didn't have a clean break + * from cycle number N to cycle number N-1. In this case, we need + * to find the first block with cycle number N-1. + */ + return 0; + + bp_err: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to find log head"); + return error; +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +STATIC int +xlog_find_tail( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk) +{ + xlog_rec_header_t *rhead; + xlog_op_header_t *op_head; + xfs_caddr_t offset = NULL; + xfs_buf_t *bp; + int error, i, found; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + xfs_lsn_t tail_lsn; + int hblks; + + found = 0; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + if (*head_blk == 0) { /* special case */ + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto done; + } + } + + /* + * Search backwards looking for log record header block + */ + ASSERT(*head_blk < INT_MAX); + for (i = (int)(*head_blk) - 1; i >= 0; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset ==
cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + found = 1; + break; + } + } + /* + * If we haven't found the log record header block, start looking + * again from the end of the physical log. XXXmiken: There should be + * a check here to make sure we didn't search more than N blocks in + * the previous code. + */ + if (!found) { + for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + found = 2; + break; + } + } + } + if (!found) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); + ASSERT(0); + return -EIO; + } + + /* find blk_no of tail of log */ + rhead = (xlog_rec_header_t *)offset; + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = i; + log->l_curr_block = (int)*head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (found == 2) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + + /* + * Look for unmount record. If we find it, then we know there + * was a clean unmount.
Since 'i' could be the last block in + * the physical log, we convert to a log block before comparing + * to the head_blk. + * + * Save the current tail lsn to use to pass to + * xlog_clear_stale_blocks() below. We won't want to clear the + * unmount record if there is one, so we pass the lsn of the + * unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = (i + hblks + (int) + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + tail_lsn = atomic64_read(&log->l_tail_lsn); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = (i + hblks) % log->l_logBBsize; + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + goto done; + + op_head = (xlog_op_header_t *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written + * log records will point recovery to after the + * current unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + } + } + + /* + * Make sure that there are no blocks in front of the head + * with the same cycle number as the head.
This can happen + * because we allow multiple outstanding log writes concurrently, + * and the later writes might make it out before earlier ones. + * + * We use the lsn from before modifying it so that we'll never + * overwrite the unmount record after a clean unmount. + * + * Do this only if we are going to recover the filesystem + * + * NOTE: This used to say "if (!readonly)" + * However on Linux, we can & do recover a read-only filesystem. + * We only skip recovery if NORECOVERY is specified on mount, + * in which case we would not be here. + * + * But... if the -device- itself is readonly, just skip this. + * We can't recover this device anyway, so it won't matter. + */ + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) + error = xlog_clear_stale_blocks(log, tail_lsn); + +done: + xlog_put_bp(bp); + + if (error) + xfs_warn(log->l_mp, "failed to locate log tail"); + return error; +} + +/* + * Is the log zeroed at all? + * + * The last binary search should be changed to perform an X block read + * once X becomes small enough. You can then search linearly through + * the X blocks. This will cut down on the number of reads we need to do. + * + * If the log is partially zeroed, this routine will pass back the blkno + * of the first block with cycle number 0. It won't have a complete LR + * preceding it.
+ * + * Return: + * 0 => the log is completely written to + * 1 => use *blk_no as the first block of the log + * <0 => error has occurred + */ +STATIC int +xlog_find_zeroed( + struct xlog *log, + xfs_daddr_t *blk_no) +{ + xfs_buf_t *bp; + xfs_caddr_t offset; + uint first_cycle, last_cycle; + xfs_daddr_t new_blk, last_blk, start_blk; + xfs_daddr_t num_scan_bblks; + int error, log_bbnum = log->l_logBBsize; + + *blk_no = 0; + + /* check totally zeroed log */ + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto bp_err; + + first_cycle = xlog_get_cycle(offset); + if (first_cycle == 0) { /* completely zeroed log */ + *blk_no = 0; + xlog_put_bp(bp); + return 1; + } + + /* check partially zeroed log */ + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) + goto bp_err; + + last_cycle = xlog_get_cycle(offset); + if (last_cycle != 0) { /* log completely written to */ + xlog_put_bp(bp); + return 0; + } else if (first_cycle != 1) { + /* + * If the cycle of the last block is zero, the cycle of + * the first block must be 1. If it's not, maybe we're + * not looking at a log... Bail out. + */ + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + error = -EINVAL; + goto bp_err; + } + + /* we have a partially zeroed log */ + last_blk = log_bbnum-1; + if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) + goto bp_err; + + /* + * Validate the answer. Because there is no way to guarantee that + * the entire log is made up of log records which are the same size, + * we scan over the defined maximum blocks. At this point, the maximum + * is not chosen to mean anything special. 
XXXmiken + */ + num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); + ASSERT(num_scan_bblks <= INT_MAX); + + if (last_blk < num_scan_bblks) + num_scan_bblks = last_blk; + start_blk = last_blk - num_scan_bblks; + + /* + * We search for any instances of cycle number 0 that occur before + * our current estimate of the head. What we're trying to detect is + * 1 ... | 0 | 1 | 0... + * ^ binary search ends here + */ + if ((error = xlog_find_verify_cycle(log, start_blk, + (int)num_scan_bblks, 0, &new_blk))) + goto bp_err; + if (new_blk != -1) + last_blk = new_blk; + + /* + * Potentially backup over partial log record write. We don't need + * to search the end of the log because we know it is zero. + */ + error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0); + if (error == 1) + error = -EIO; + if (error) + goto bp_err; + + *blk_no = last_blk; +bp_err: + xlog_put_bp(bp); + if (error) + return error; + return 1; +} + +/* + * These are simple subroutines used by xlog_clear_stale_blocks() below + * to initialize a buffer full of empty log record headers and write + * them into the log. + */ +STATIC void +xlog_add_record( + struct xlog *log, + xfs_caddr_t buf, + int cycle, + int block, + int tail_cycle, + int tail_block) +{ + xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + + memset(buf, 0, BBSIZE); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); +} + +STATIC int +xlog_write_log_records( + struct xlog *log, + int cycle, + int start_block, + int blocks, + int tail_cycle, + int tail_block) +{ + xfs_caddr_t offset; + xfs_buf_t *bp; + int balign, ealign; + int sectbb = log->l_sectBBsize; + int end_block = start_block + blocks; + int bufblks; + int error = 0; + int i, j = 0; + + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ + bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; + while (!(bp = xlog_get_bp(log, bufblks))) { + bufblks >>= 1; + if (bufblks < sectbb) + return -ENOMEM; + } + + /* We may need to do a read at the start to fill in part of + * the buffer in the starting sector not covered by the first + * write below. + */ + balign = round_down(start_block, sectbb); + if (balign != start_block) { + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + + j = start_block - balign; + } + + for (i = start_block; i < end_block; i += bufblks) { + int bcount, endcount; + + bcount = min(bufblks, end_block - start_block); + endcount = bcount - j; + + /* We may need to do a read at the end to fill in part of + * the buffer in the final sector not covered by the write. + * If this is the same sector as the above read, skip it. 
+ */ + ealign = round_down(end_block, sectbb); + if (j == 0 && (start_block + endcount > ealign)) { + offset = bp->b_addr + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); + if (error) + break; + + } + + offset = xlog_align(log, start_block, endcount, bp); + for (; j < endcount; j++) { + xlog_add_record(log, offset, cycle, i+j, + tail_cycle, tail_block); + offset += BBSIZE; + } + error = xlog_bwrite(log, start_block, endcount, bp); + if (error) + break; + start_block += endcount; + j = 0; + } + + out_put_bp: + xlog_put_bp(bp); + return error; +} + +/* + * This routine is called to blow away any incomplete log writes out + * in front of the log head. We do this so that we won't become confused + * if we come up, write only a little bit more, and then crash again. + * If we leave the partial log records out there, this situation could + * cause us to think those partial writes are valid blocks since they + * have the current cycle number. We get rid of them by overwriting them + * with empty log records with the old cycle number rather than the + * current one. + * + * The tail lsn is passed in rather than taken from + * the log so that we will not write over the unmount record after a + * clean unmount in a 512 block log. Doing so would leave the log without + * any valid log records in it until a new one was written. If we crashed + * during that time we would not be able to recover. + */ +STATIC int +xlog_clear_stale_blocks( + struct xlog *log, + xfs_lsn_t tail_lsn) +{ + int tail_cycle, head_cycle; + int tail_block, head_block; + int tail_distance, max_distance; + int distance; + int error; + + tail_cycle = CYCLE_LSN(tail_lsn); + tail_block = BLOCK_LSN(tail_lsn); + head_cycle = log->l_curr_cycle; + head_block = log->l_curr_block; + + /* + * Figure out the distance between the new head of the log + * and the tail. 
We want to write over any blocks beyond the + * head that we may have written just before the crash, but + * we don't want to overwrite the tail of the log. + */ + if (head_cycle == tail_cycle) { + /* + * The tail is behind the head in the physical log, + * so the distance from the head to the tail is the + * distance from the head to the end of the log plus + * the distance from the beginning of the log to the + * tail. + */ + if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { + XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + tail_distance = tail_block + (log->l_logBBsize - head_block); + } else { + /* + * The head is behind the tail in the physical log, + * so the distance from the head to the tail is just + * the tail block minus the head block. + */ + if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ + XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", + XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + tail_distance = tail_block - head_block; + } + + /* + * If the head is right up against the tail, we can't clear + * anything. + */ + if (tail_distance <= 0) { + ASSERT(tail_distance == 0); + return 0; + } + + max_distance = XLOG_TOTAL_REC_SHIFT(log); + /* + * Take the smaller of the maximum amount of outstanding I/O + * we could have and the distance to the tail to clear out. + * We take the smaller so that we don't overwrite the tail and + * we don't waste all day writing from the head to the tail + * for no reason. + */ + max_distance = MIN(max_distance, tail_distance); + + if ((head_block + max_distance) <= log->l_logBBsize) { + /* + * We can stomp all the blocks we need to without + * wrapping around the end of the log. Just do it + * in a single write. Use the cycle number of the + * current cycle minus one so that the log will look like: + * n ... | n - 1 ... 
+ */ + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, max_distance, tail_cycle, + tail_block); + if (error) + return error; + } else { + /* + * We need to wrap around the end of the physical log in + * order to clear all the blocks. Do it in two separate + * I/Os. The first write should be from the head to the + * end of the physical log, and it should use the current + * cycle number minus one just like above. + */ + distance = log->l_logBBsize - head_block; + error = xlog_write_log_records(log, (head_cycle - 1), + head_block, distance, tail_cycle, + tail_block); + + if (error) + return error; + + /* + * Now write the blocks at the start of the physical log. + * This writes the remainder of the blocks we want to clear. + * It uses the current cycle number since we're now on the + * same cycle as the head so that we get: + * n ... n ... | n - 1 ... + * ^^^^^ blocks we're writing + */ + distance = max_distance - (log->l_logBBsize - head_block); + error = xlog_write_log_records(log, head_cycle, 0, distance, + tail_cycle, tail_block); + if (error) + return error; + } + + return 0; +} + +/****************************************************************************** + * + * Log recover routines + * + ****************************************************************************** + */ + +/* + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. 
However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * from the cancelled buffer table. Hence they have to be done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - inode_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve.
+ */ +STATIC int +xlog_recover_reorder_trans( + struct xlog *log, + struct xlog_recover *trans, + int pass) +{ + xlog_recover_item_t *item, *n; + int error = 0; + LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + + switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; + case XFS_LI_BUF: + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; + case XFS_LI_INODE: + case XFS_LI_DQUOT: + case XFS_LI_QUOTAOFF: + case XFS_LI_EFD: + case XFS_LI_EFI: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &inode_list); + break; + default: + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); + ASSERT(0); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = -EIO; + goto out; + } + } +out: + ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; +} + +/* + * Build up the table of buf cancel records so that we don't replay + * cancelled data in the second pass. 
For buffer records that are
 * not cancel records, there is nothing to do here so we just return.
 *
 * If we get a cancel record which is already in the table, this indicates
 * that the buffer was cancelled multiple times.  In order to ensure
 * that during pass 2 we keep the record in the table until we reach its
 * last occurrence in the log, we keep a reference count in the cancel
 * record in the table to tell us how many times we expect to see this
 * record during the second pass.
 *
 * Always returns 0.
 */
STATIC int
xlog_recover_buffer_pass1(
	struct xlog			*log,
	struct xlog_recover_item	*item)
{
	xfs_buf_log_format_t	*blf = item->ri_buf[0].i_addr;
	struct list_head	*chain;
	struct xfs_buf_cancel	*rec;

	/* Only cancel items are tracked in pass 1. */
	if (!(blf->blf_flags & XFS_BLF_CANCEL)) {
		trace_xfs_log_recover_buf_not_cancel(log, blf);
		return 0;
	}

	/*
	 * Look for an existing record covering the same block number and
	 * length; a repeat cancellation just takes another reference.
	 */
	chain = XLOG_BUF_CANCEL_BUCKET(log, blf->blf_blkno);
	list_for_each_entry(rec, chain, bc_list) {
		if (rec->bc_blkno != blf->blf_blkno)
			continue;
		if (rec->bc_len != blf->blf_len)
			continue;

		rec->bc_refcount++;
		trace_xfs_log_recover_buf_cancel_ref_inc(log, blf);
		return 0;
	}

	/* First cancellation of this buffer: append a new record. */
	rec = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
	rec->bc_blkno = blf->blf_blkno;
	rec->bc_len = blf->blf_len;
	rec->bc_refcount = 1;
	list_add_tail(&rec->bc_list, chain);

	trace_xfs_log_recover_buf_cancel_add(log, blf);
	return 0;
}

/*
 * Check to see whether the buffer being recovered has a corresponding
 * entry in the buffer cancel record table. If it is, return the cancel
 * buffer structure to the caller.
+ */ +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; + } + + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; + } + + /* + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. + */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; +} + +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; + + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; +} + +/* + * Perform recovery for a buffer full of inodes. 
In these buffers, the only
 * data which should be recovered is that which corresponds to the
 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
 * data for the inodes is always logged through the inodes themselves rather
 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
 *
 * The only time when buffers full of inodes are fully recovered is when the
 * buffer is full of newly allocated inodes.  In this case the buffer will
 * not be marked as an inode buffer and so will be sent to
 * xlog_recover_do_reg_buffer() below during recovery.
 *
 * Returns 0 on success, or -EFSCORRUPTED if a logged di_next_unlinked value
 * is zero.
 */
STATIC int
xlog_recover_do_inode_buffer(
	struct xfs_mount	*mp,
	xlog_recover_item_t	*item,
	struct xfs_buf		*bp,
	xfs_buf_log_format_t	*buf_f)
{
	int			ino;
	int			nr_inodes;
	int			region = 0;	/* index into item->ri_buf[] */
	int			first_bit = 0;
	int			run_bits = 0;
	int			reg_start = 0;	/* byte offset of logged region */
	int			reg_len = 0;	/* byte length of logged region */

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/*
	 * Post recovery validation only works properly on CRC enabled
	 * filesystems.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb))
		bp->b_ops = &xfs_inode_buf_ops;

	nr_inodes = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
	for (ino = 0; ino < nr_inodes; ino++) {
		xfs_agino_t	*logged_nextp;
		xfs_agino_t	*buffer_nextp;
		int		unlinked_off;

		unlinked_off = (ino * mp->m_sb.sb_inodesize) +
			offsetof(xfs_dinode_t, di_next_unlinked);

		/*
		 * Advance through the logged regions until we reach one that
		 * contains, or lies beyond, this inode's di_next_unlinked.
		 */
		while (unlinked_off >= (reg_start + reg_len)) {
			first_bit += run_bits;
			first_bit = xfs_next_bit(buf_f->blf_data_map,
						 buf_f->blf_map_size,
						 first_bit);

			/* No more logged regions in the buffer: done. */
			if (first_bit == -1)
				return 0;

			run_bits = xfs_contig_bits(buf_f->blf_data_map,
						   buf_f->blf_map_size,
						   first_bit);
			ASSERT(run_bits > 0);
			reg_start = first_bit << XFS_BLF_SHIFT;
			reg_len = run_bits << XFS_BLF_SHIFT;
			region++;
		}

		/*
		 * The region starts after this field, so the field was not
		 * logged; try the next inode.
		 */
		if (unlinked_off < reg_start)
			continue;

		ASSERT(item->ri_buf[region].i_addr != NULL);
		ASSERT((item->ri_buf[region].i_len % XFS_BLF_CHUNK) == 0);
		ASSERT((reg_start + reg_len) <=
							BBTOB(bp->b_io_length));

		/*
		 * The current logged region holds a copy of this
		 * di_next_unlinked field.  Pull it out and store it in the
		 * buffer.
		 */
		logged_nextp = item->ri_buf[region].i_addr +
				unlinked_off - reg_start;
		if (unlikely(*logged_nextp == 0)) {
			xfs_alert(mp,
		"Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
		"Trying to replay bad (0) inode di_next_unlinked field.",
				item, bp);
			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
					 XFS_ERRLEVEL_LOW, mp);
			return -EFSCORRUPTED;
		}

		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, unlinked_off);
		*buffer_nextp = *logged_nextp;

		/*
		 * If necessary, recalculate the CRC in the on-disk inode. We
		 * have to leave the inode in a consistent state for whoever
		 * reads it next....
		 */
		xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
				xfs_buf_offset(bp, ino * mp->m_sb.sb_inodesize));
	}

	return 0;
}

/*
 * V5 filesystems know the age of the buffer on disk being recovered. We can
 * have newer objects on disk than we are replaying, and so for these cases we
 * don't want to replay the current change as that will make the buffer contents
 * temporarily invalid on disk.
 *
 * The magic number might not match the buffer type we are going to recover
 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.
Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = 
be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. 
+ */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. 
Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + 
bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + 
xfs_blft_from_flags(buf_f)); + break; + } +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. + */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + xlog_recover_item_t *item, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + int i; + int bit; + int nbits; + int error; + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. 
+ */ + error = 0; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, + -1, 0, XFS_QMOPT_DOWARN, + "dquot_buf_recover"); + if (error) + goto next; + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<ri_total); + + xlog_recover_validate_buf_type(mp, bp, buf_f); +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF log item of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. + */ +STATIC bool +xlog_recover_do_dquot_buffer( + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (!mp->m_qflags) + return false; + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return false; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + return true; +} + +/* + * This routine replays a modification made to a buffer at runtime. 
+ * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_buffer_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buffer_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + int error; + uint buf_flags; + xfs_lsn_t lsn; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. 
+ */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, NULL); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); + goto out_release; + } + + /* + * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verfiers to the buffers so if we don't + * actually do any replay after readahead because of the LSN we found + * in the buffer if more recent than that current transaction then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers and they won't get the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recover turns that + * buffer into. 
+ */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + xlog_recover_validate_buf_type(mp, bp, buf_f); + goto out_release; + } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + } + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. 
+ */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); + } else { + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. 
+ */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return -ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_inode_log_format_t *in_f; + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + xfs_dinode_t *dip; + int len; + xfs_caddr_t src; + xfs_caddr_t dest; + int error; + int attr_index; + uint fields; + xfs_icdinode_t *dicp; + uint isize; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. 
+ */ + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error; + } + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); + goto out_release; + } + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_release; + } + dicp = item->ri_buf[1].i_addr; + if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", + XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. 
All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* Take the opportunity to reset the flush iteration count */ + dicp->di_flushiter = 0; + + if (unlikely(S_ISREG(dicp->di_mode))) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } else if (unlikely(S_ISDIR(dicp->di_mode))) { + if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && + (dicp->di_format != XFS_DINODE_FMT_BTREE) && + (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } + if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log 
record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + dicp->di_nextents + dicp->di_anextents, + dicp->di_nblocks); + error = -EFSCORRUPTED; + goto out_release; + } + if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); + error = -EFSCORRUPTED; + goto out_release; + } + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, dicp); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); + error = -EFSCORRUPTED; + goto out_release; + } + + /* The core is in in-core format */ + xfs_dinode_to_disk(dip, dicp); + + /* the rest is in on-disk format */ + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); + } + + fields = in_f->ilf_fields; + switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { + case XFS_ILOG_DEV: + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + break; + case XFS_ILOG_UUID: + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); + break; + } + + if (in_f->ilf_size == 2) + goto out_owner_change; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + 
(xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + error = -EIO; + goto out_release; + } + } + +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); +error: + if (need_free) + kmem_free(in_f); + return error; +} + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. 
+ */ +STATIC int +xlog_recover_quotaoff_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return 0; +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + xfs_mount_t *mp = log->l_mp; + xfs_buf_t *bp; + struct xfs_disk_dquot *ddq, *recddq; + int error; + xfs_dq_logformat_t *dq_f; + uint type; + + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return 0; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return -EIO; + } + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return -EIO; + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return 0; + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. 
+ */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) + return -EIO; + ASSERT(dq_f->qlf_len == 1); + + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then the + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + &xfs_dquot_buf_ops); + if (error) + return error; + + ASSERT(bp); + ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); + return 0; +} + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. 
+ */ +STATIC int +xlog_recover_efi_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + xfs_mount_t *mp = log->l_mp; + xfs_efi_log_item_t *efip; + xfs_efi_log_format_t *efi_formatp; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), + &(efip->efi_format)))) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * xfs_trans_ail_update() drops the AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); + return 0; +} + + +/* + * This routine is called when an efd format structure is found in + * a committed transaction in the log. It's purpose is to cancel + * the corresponding efi if it was still in the log. To do this + * it searches the AIL for the efi with an id equal to that in the + * efd format structure. If we find it, we remove the efi from the + * AIL and free it. + */ +STATIC int +xlog_recover_efd_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + xfs_efd_log_format_t *efd_formatp; + xfs_efi_log_item_t *efip = NULL; + xfs_log_item_t *lip; + __uint64_t efi_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + efi_id = efd_formatp->efd_efi_id; + + /* + * Search for the efi with the id in the efd format structure + * in the AIL. 
+ */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { + efip = (xfs_efi_log_item_t *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * xfs_trans_ail_delete() drops the + * AIL lock. + */ + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); + xfs_efi_item_free(efip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. 
+ */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return -EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return -EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return -EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return -EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return -EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return -EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return -EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return -EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. 
+ * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + 
ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + +STATIC int +xlog_recover_commit_pass1( + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + case XFS_LI_ICREATE: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return -EIO; + } +} + +STATIC int +xlog_recover_commit_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, buffer_list, 
item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return -EIO; + } +} + +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int +xlog_recover_commit_trans( + struct xlog *log, + struct xlog_recover *trans, + int pass) +{ + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) + return error; + + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: + error = xlog_recover_commit_pass1(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + + if (error) + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = 
xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; +} + +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + +STATIC int +xlog_recover_add_to_cont_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + memcpy(&ptr[old_len], dp, len); + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. 
We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. + */ +STATIC int +xlog_recover_add_to_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return -EIO; + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + kmem_free(ptr); + return -EIO; + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); + return 0; +} + +/* + * Free up any resources allocated by the transaction + * + * Remember 
that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + +/* + * On error or completion, trans is freed. + */ +STATIC int +xlog_recovery_process_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + unsigned int len, + unsigned int flags, + int pass) +{ + int error = 0; + bool freeit = false; + + /* mask off ophdr transaction container flags */ + flags &= ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + + /* + * Callees must not free the trans structure. We'll decide if we need to + * free it or not based on the operation being done and it's result. + */ + switch (flags) { + /* expected flag values */ + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, dp, len); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, trans, dp, len); + break; + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, trans, pass); + /* success or fail, we are now done with this transaction. */ + freeit = true; + break; + + /* unexpected flag values */ + case XLOG_UNMOUNT_TRANS: + /* just skip trans */ + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + freeit = true; + break; + case XLOG_START_TRANS: + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); + ASSERT(0); + error = -EIO; + break; + } + if (error || freeit) + xlog_recover_free_trans(trans); + return error; +} + +/* + * Lookup the transaction recovery structure associated with the ID in the + * current ophdr. 
If the transaction doesn't exist and the start flag is set in + * the ophdr, then allocate a new transaction for future ID matches to find. + * Either way, return what we found during the lookup - an existing transaction + * or nothing. + */ +STATIC struct xlog_recover * +xlog_recover_ophdr_to_trans( + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead) +{ + struct xlog_recover *trans; + xlog_tid_t tid; + struct hlist_head *rhp; + + tid = be32_to_cpu(ohead->oh_tid); + rhp = &rhash[XLOG_RHASH(tid)]; + hlist_for_each_entry(trans, rhp, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + + /* + * skip over non-start transaction headers - we could be + * processing slack space before the next transaction starts + */ + if (!(ohead->oh_flags & XLOG_START_TRANS)) + return NULL; + + ASSERT(be32_to_cpu(ohead->oh_len) == 0); + + /* + * This is a new transaction so allocate a new recovery container to + * hold the recovery ops that will follow. + */ + trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = be64_to_cpu(rhead->h_lsn); + INIT_LIST_HEAD(&trans->r_itemq); + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, rhp); + + /* + * Nothing more to do for this ophdr. Items to be added to this new + * transaction will be in subsequent ophdr containers. + */ + return NULL; +} + +STATIC int +xlog_recover_process_ophdr( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead, + xfs_caddr_t dp, + xfs_caddr_t end, + int pass) +{ + struct xlog_recover *trans; + unsigned int len; + + /* Do we understand who wrote this op? */ + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return -EIO; + } + + /* + * Check the ophdr contains all the data it is supposed to contain. 
+ */ + len = be32_to_cpu(ohead->oh_len); + if (dp + len > end) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); + WARN_ON(1); + return -EIO; + } + + trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); + if (!trans) { + /* nothing to do, so skip over this ophdr */ + return 0; + } + + return xlog_recovery_process_trans(log, trans, dp, len, + ohead->oh_flags, pass); +} + +/* + * There are two valid states of the r_state field. 0 indicates that the + * transaction structure is in a normal state. We have either seen the + * start of the transaction or the last operation we added was not a partial + * operation. If the last operation we added to the transaction was a + * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. + * + * NOTE: skip LRs with 0 data length. + */ +STATIC int +xlog_recover_process_data( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + int pass) +{ + struct xlog_op_header *ohead; + xfs_caddr_t end; + int num_logops; + int error; + + end = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); + + /* check the log format matches our own - else we can't recover */ + if (xlog_header_check_recover(log->l_mp, rhead)) + return -EIO; + + while ((dp < end) && num_logops) { + + ohead = (struct xlog_op_header *)dp; + dp += sizeof(*ohead); + ASSERT(dp <= end); + + /* errors will abort recovery */ + error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, + dp, end, pass); + if (error) + return error; + + dp += be32_to_cpu(ohead->oh_len); + num_logops--; + } + return 0; +} + +/* + * Process an extent free intent item that was recovered from + * the log. We need to free the extents that it describes. 
+ */ +STATIC int +xlog_recover_process_efi( + xfs_mount_t *mp, + xfs_efi_log_item_t *efip) +{ + xfs_efd_log_item_t *efdp; + xfs_trans_t *tp; + int i; + int error = 0; + xfs_extent_t *extp; + xfs_fsblock_t startblock_fsb; + + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + + /* + * First check the validity of the extents described by the + * EFI. If any are bad, then assume that all are bad and + * just toss the EFI. + */ + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, extp->ext_start)); + if ((startblock_fsb == 0) || + (extp->ext_len == 0) || + (startblock_fsb >= mp->m_sb.sb_dblocks) || + (extp->ext_len >= mp->m_sb.sb_agblocks)) { + /* + * This will pull the EFI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + xfs_efi_release(efip, efip->efi_format.efi_nextents); + return -EIO; + } + } + + tp = xfs_trans_alloc(mp, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto abort_error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + extp = &(efip->efi_format.efi_extents[i]); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; + xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, + extp->ext_len); + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp, 0); + return error; + +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; +} + +/* + * When this is called, all of the EFIs which did not have + * corresponding EFDs should be in the AIL. What we do now + * is free the extents associated with each one. + * + * Since we process the EFIs in normal transactions, they + * will be removed at some point after the commit. This prevents + * us from just walking down the list processing each one. 
* We'll use a flag in the EFI to skip those that we've already
 * processed and use the AIL iteration mechanism's generation
 * count to try to speed this up at least a bit.
 *
 * When we start, we know that the EFIs are the only things in
 * the AIL.  As we process them, however, other items are added
 * to the AIL.  Since everything added to the AIL must come after
 * everything already in the AIL, we stop processing as soon as
 * we see something other than an EFI in the AIL.
 *
 * Returns 0 on success or the first error returned by
 * xlog_recover_process_efi(), at which point iteration stops.
 */
STATIC int
xlog_recover_process_efis(
	struct xlog	*log)
{
	xfs_log_item_t		*lip;
	xfs_efi_log_item_t	*efip;
	int			error = 0;
	struct xfs_ail_cursor	cur;
	struct xfs_ail		*ailp;

	ailp = log->l_ailp;
	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		/*
		 * We're done when we see something other than an EFI.
		 * There should be no EFIs left in the AIL now.
		 */
		if (lip->li_type != XFS_LI_EFI) {
#ifdef DEBUG
			/* debug only: verify no EFI follows a non-EFI item */
			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
				ASSERT(lip->li_type != XFS_LI_EFI);
#endif
			break;
		}

		/*
		 * Skip EFIs that we've already processed.
		 */
		efip = (xfs_efi_log_item_t *)lip;
		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
			lip = xfs_trans_ail_cursor_next(ailp, &cur);
			continue;
		}

		/*
		 * The AIL lock is dropped while the EFI is processed and
		 * retaken before the cursor is advanced.
		 */
		spin_unlock(&ailp->xa_lock);
		error = xlog_recover_process_efi(log->l_mp, efip);
		spin_lock(&ailp->xa_lock);
		if (error)
			goto out;
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
out:
	xfs_trans_ail_cursor_done(&cur);
	spin_unlock(&ailp->xa_lock);
	return error;
}

/*
 * This routine performs a transaction to null out a bad inode pointer
 * in an agi unlinked inode hash bucket.
*/
STATIC void
xlog_recover_clear_agi_bucket(
	xfs_mount_t	*mp,
	xfs_agnumber_t	agno,
	int		bucket)
{
	xfs_trans_t	*tp;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	int		offset;
	int		error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
	if (error)
		goto out_abort;

	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		goto out_abort;

	agi = XFS_BUF_TO_AGI(agibp);
	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
	/* log only the bytes of the one bucket slot that changed */
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		 (sizeof(xfs_agino_t) * bucket);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));

	error = xfs_trans_commit(tp, 0);
	if (error)
		goto out_error;
	return;

out_abort:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
out_error:
	/* failure is only reported; the caller carries on regardless */
	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
	return;
}

/*
 * Process one inode on an AGI unlinked bucket list.
 *
 * The inode is looked up and its on-disk next_unlinked pointer is read, then
 * the inode reference is dropped again.  Returns the next inode number on
 * the bucket list, or NULLAGINO if the inode could not be read, in which
 * case the whole bucket is cleared.
 */
STATIC xfs_agino_t
xlog_recover_process_one_iunlink(
	struct xfs_mount		*mp,
	xfs_agnumber_t			agno,
	xfs_agino_t			agino,
	int				bucket)
{
	struct xfs_buf			*ibp;
	struct xfs_dinode		*dip;
	struct xfs_inode		*ip;
	xfs_ino_t			ino;
	int				error;

	ino = XFS_AGINO_TO_INO(mp, agno, agino);
	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
	if (error)
		goto fail;

	/*
	 * Get the on disk inode to find the next inode in the bucket.
	 */
	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
	if (error)
		goto fail_iput;

	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	/* setup for the next pass */
	agino = be32_to_cpu(dip->di_next_unlinked);
	xfs_buf_relse(ibp);

	/*
	 * Prevent any DMAPI event from being sent when the reference on
	 * the inode is dropped.
	 */
	ip->i_d.di_dmevmask = 0;

	IRELE(ip);
	return agino;

 fail_iput:
	IRELE(ip);
 fail:
	/*
	 * We can't read in the inode this bucket points to, or this inode
	 * is messed up.  Just ditch this bucket of inodes.  We will lose
	 * some inodes and space, but at least we won't hang.
	 *
	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
	 * clear the inode pointer in the bucket.
	 */
	xlog_recover_clear_agi_bucket(mp, agno, bucket);
	return NULLAGINO;
}

/*
 * xlog_iunlink_recover
 *
 * This is called during recovery to process any inodes which
 * we unlinked but not freed when the system crashed.  These
 * inodes will be on the lists in the AGI blocks.  What we do
 * here is scan all the AGIs and fully truncate and free any
 * inodes found on the lists.  Each inode is removed from the
 * lists when it has been fully truncated and is freed.  The
 * freeing of the inode and its removal from the list must be
 * atomic.
 */
STATIC void
xlog_recover_process_iunlinks(
	struct xlog	*log)
{
	xfs_mount_t	*mp;
	xfs_agnumber_t	agno;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	xfs_agino_t	agino;
	int		bucket;
	int		error;
	uint		mp_dmevmask;

	mp = log->l_mp;

	/*
	 * Prevent any DMAPI event from being sent while in this function.
	 */
	mp_dmevmask = mp->m_dmevmask;
	mp->m_dmevmask = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/*
		 * Find the agi for this ag.
		 */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (error) {
			/*
			 * AGI is b0rked. Don't process it.
			 *
			 * We should probably mark the filesystem as corrupt
			 * after we've recovered all the ag's we can....
			 */
			continue;
		}
		/*
		 * Unlock the buffer so that it can be acquired in the normal
		 * course of the transaction to truncate and free each inode.
		 * Because we are not racing with anyone else here for the AGI
		 * buffer, we don't even need to hold it locked to read the
		 * initial unlinked bucket entries out of the buffer. We keep
		 * buffer reference though, so that it stays pinned in memory
		 * while we need the buffer.
		 */
		agi = XFS_BUF_TO_AGI(agibp);
		xfs_buf_unlock(agibp);

		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
			/* walk the chain until the per-inode helper says stop */
			while (agino != NULLAGINO) {
				agino = xlog_recover_process_one_iunlink(mp,
							agno, agino, bucket);
			}
		}
		xfs_buf_rele(agibp);
	}

	/* restore the DMAPI event mask saved on entry */
	mp->m_dmevmask = mp_dmevmask;
}

/*
 * Unpack the log buffer data and crc check it. If the check fails, issue a
 * warning if and only if the CRC in the header is non-zero. This makes the
 * check an advisory warning, and the zero CRC check will prevent failure
 * warnings from being emitted when upgrading the kernel from one that does not
 * add CRCs by default.
 *
 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 * corruption failure
 *
 * Returns 0, or -EFSCORRUPTED on a mismatch when the superblock has CRCs
 * enabled.
 */
STATIC int
xlog_unpack_data_crc(
	struct xlog_rec_header	*rhead,
	xfs_caddr_t		dp,
	struct xlog		*log)
{
	__le32			crc;

	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
	if (crc != rhead->h_crc) {
		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
			xfs_alert(log->l_mp,
		"log record CRC mismatch: found 0x%x, expected 0x%x.",
					le32_to_cpu(rhead->h_crc),
					le32_to_cpu(crc));
			xfs_hex_dump(dp, 32);
		}

		/*
		 * If we've detected a log record corruption, then we can't
		 * recover past this point. Abort recovery if we are enforcing
		 * CRC protection by punting an error back up the stack.
		 */
		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
			return -EFSCORRUPTED;
	}

	return 0;
}

/*
 * Verify the record CRC, then restore the first 32-bit word of every basic
 * block of record data from the cycle data saved in the record header (and,
 * for v2 logs, in the extended headers).
 *
 * Returns 0 or the error from the CRC check.
 */
STATIC int
xlog_unpack_data(
	struct xlog_rec_header	*rhead,
	xfs_caddr_t		dp,
	struct xlog		*log)
{
	int			i, j, k;
	int			error;

	error = xlog_unpack_data_crc(rhead, dp, log);
	if (error)
		return error;

	/* blocks covered by the cycle data array in the base header */
	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
		dp += BBSIZE;
	}

	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
		/* remaining blocks: j selects the header, k the slot in it */
		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
			dp += BBSIZE;
		}
	}

	return 0;
}

/*
 * Sanity check a log record header read from block @blkno: magic number,
 * version bits, a positive data length and a block number within the log.
 *
 * Returns 0 if the header looks valid, -EIO for an unrecognised version and
 * -EFSCORRUPTED for the other failures.
 */
STATIC int
xlog_valid_rec_header(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	xfs_daddr_t		blkno)
{
	int			hlen;

	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return -EFSCORRUPTED;
	}
	if (unlikely(
	    (!rhead->h_version ||
	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
			__func__, be32_to_cpu(rhead->h_version));
		return -EIO;
	}

	/* LR body must have data or it wouldn't have been written */
	hlen = be32_to_cpu(rhead->h_len);
	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return -EFSCORRUPTED;
	}
	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * Read the log from tail to head and process the log records found.
* Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 *
 * Returns 0 on success, -ENOMEM if a header or data buffer cannot be
 * obtained, or the first error from reading, validating, unpacking or
 * processing a log record.
 */
STATIC int
xlog_do_recovery_pass(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			pass)
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no;
	xfs_caddr_t		offset;
	xfs_buf_t		*hbp, *dbp;
	int			error = 0, h_size;
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];

	ASSERT(head_blk != tail_blk);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_get_bp(log, 1);
		if (!hbp)
			return -ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, tail_blk);
		if (error)
			goto bread_err1;
		h_size = be32_to_cpu(rhead->h_size);
		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			/* round up to whole header blocks */
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
			xlog_put_bp(hbp);
			hbp = xlog_get_bp(log, hblks);
		} else {
			hblks = 1;
		}
	} else {
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_get_bp(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}

	/* covers both the resized v2 header buffer and the v1 allocation */
	if (!hbp)
		return -ENOMEM;
	/*
	 * NOTE(review): dbp is sized from h_size, while each record's read
	 * length below comes from its own h_len; nothing in this function
	 * compares the two - confirm xlog_bread() rejects oversized reads.
	 */
	dbp = xlog_get_bp(log, BTOBB(h_size));
	if (!dbp) {
		xlog_put_bp(hbp);
		return -ENOMEM;
	}

	memset(rhash, 0, sizeof(rhash));
	blk_no = tail_blk;
	if (tail_blk > head_blk) {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery.
		 */
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp->b_addr;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_offset(log, 0,
						wrapped_hblks, hbp,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = dbp->b_addr;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							split_bblks, dbp,
							&offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = xlog_bread_offset(log, 0,
						bblks - split_bblks, dbp,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks;
		}

		/* wrap the block number back to the start of the log */
		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;
	}

	/* read first part of physical log */
	while (blk_no < head_blk) {
		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
		if (error)
			goto bread_err2;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, blk_no);
		if (error)
			goto bread_err2;

		/* blocks in data section */
		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
				   &offset);
		if (error)
			goto bread_err2;

		error = xlog_unpack_data(rhead, offset, log);
		if (error)
			goto bread_err2;

		error = xlog_recover_process_data(log, rhash,
						rhead, offset, pass);
		if (error)
			goto bread_err2;
		blk_no += bblks + hblks;
	}

	/* success falls through here too, with error == 0 */
 bread_err2:
	xlog_put_bp(dbp);
 bread_err1:
	xlog_put_bp(hbp);
	return error;
}

/*
 * Do the recovery of the log.  We actually do this in two phases.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines those things which have been cancelled, and the
 * second pass replays log items normally except for those which
 * have been cancelled.  The handling of the replay and cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
*/
STATIC int
xlog_do_log_recovery(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error, i;

	ASSERT(head_blk != tail_blk);

	/*
	 * First do a pass to find all of the cancelled buf log items.
	 * Store them in the buf_cancel_table for use in the second pass.
	 */
	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
						 sizeof(struct list_head),
						 KM_SLEEP);
	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);

	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS1);
	if (error != 0) {
		kmem_free(log->l_buf_cancel_table);
		log->l_buf_cancel_table = NULL;
		return error;
	}
	/*
	 * Then do a second pass to actually recover the items in the log.
	 * When it is complete free the table of buf cancel items.
	 */
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS2);
#ifdef DEBUG
	/* debug only: a clean pass 2 must leave every cancel bucket empty */
	if (!error) {
		int	i;

		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
	}
#endif	/* DEBUG */

	kmem_free(log->l_buf_cancel_table);
	log->l_buf_cancel_table = NULL;

	return error;
}

/*
 * Do the actual recovery
 *
 * Replays the log between @tail_blk and @head_blk, updates the tail LSN,
 * then re-reads and re-converts the superblock and clears
 * XLOG_ACTIVE_RECOVERY.
 *
 * Returns 0 on success, -EIO if the filesystem was shut down during replay,
 * or the error from replay or from re-reading the superblock.
 */
STATIC int
xlog_do_recover(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;

	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error)
		return error;

	/*
	 * If IO errors happened during recovery, bail out.
	 */
	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
		return -EIO;
	}

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * or iunlinks, we can free up the entire log and set the tail_lsn to
	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
	 * lsn of the last known good LR on disk.  If there are extent frees
	 * or iunlinks they will have some entries in the AIL; so we look at
	 * the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(log->l_mp);

	/*
	 * Now that we've finished replaying all buffer and inode
	 * updates, re-read in the superblock and reverify it.
	 */
	bp = xfs_getsb(log->l_mp, 0);
	XFS_BUF_UNDONE(bp);
	ASSERT(!(XFS_BUF_ISWRITE(bp)));
	XFS_BUF_READ(bp);
	XFS_BUF_UNASYNC(bp);
	bp->b_ops = &xfs_sb_buf_ops;

	error = xfs_buf_submit_wait(bp);
	if (error) {
		if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
			xfs_buf_ioerror_alert(bp, __func__);
			ASSERT(0);
		}
		xfs_buf_relse(bp);
		return error;
	}

	/* Convert superblock from on-disk format */
	sbp = &log->l_mp->m_sb;
	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
	ASSERT(xfs_sb_good_version(sbp));
	xfs_reinit_percpu_counters(log->l_mp);

	xfs_buf_relse(bp);


	xlog_recover_check_summary(log);

	/* Normal transactions can now occur */
	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
	return 0;
}

/*
 * Perform recovery and re-initialize some log variables in xlog_find_tail.
 *
 * Recovery only runs when the tail and head blocks differ.  In that case
 * XLOG_RECOVERY_NEEDED is set in log->l_flags once xlog_do_recover() has
 * returned, whether or not it succeeded.
 *
 * Return error or zero.
 */
int
xlog_recover(
	struct xlog	*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
		return error;

	if (tail_blk != head_blk) {
		/* There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts.  note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true.  Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover.  We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
			return error;
		}

		/*
		 * Version 5 superblock log feature mask validation. We know the
		 * log is dirty so check if there are any unknown log features
		 * in what we need to recover. If there are unknown features
		 * (e.g. unsupported transactions), then simply reject the
		 * attempt at recovery before touching anything.
		 */
		if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
			xfs_warn(log->l_mp,
"Superblock has unknown incompatible log features (0x%x) enabled.\n"
"The log can not be fully and/or safely recovered by this kernel.\n"
"Please recover the log on a kernel that supports the unknown features.",
				(log->l_mp->m_sb.sb_features_log_incompat &
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
			return -EINVAL;
		}

		/*
		 * Delay log recovery if the debug hook is set. This is debug
		 * instrumentation to coordinate simulation of I/O failures with
		 * log recovery.
		 */
		if (xfs_globals.log_recovery_delay) {
			xfs_notice(log->l_mp,
				"Delaying log recovery for %d seconds.",
				xfs_globals.log_recovery_delay);
			msleep(xfs_globals.log_recovery_delay * 1000);
		}

		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
				log->l_mp->m_logname ? log->l_mp->m_logname
						     : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
		log->l_flags |= XLOG_RECOVERY_NEEDED;
	}
	return error;
}

/*
 * In the first part of recovery we replay inodes and buffers and build
 * up the list of extent free items which need to be processed.  Here
 * we process the extent free items and clean up the on disk unlinked
 * inode lists.  This is separated from the first part of recovery so
 * that the root and real-time bitmap inodes can be read in from disk in
 * between the two stages.  This is necessary so that we can free space
 * in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(
+	struct xlog	*log)
+{
+	struct xfs_mount *mp = log->l_mp;
+	int		error;
+
+	/* No first-stage recovery was flagged: nothing is left to finish. */
+	if (!(log->l_flags & XLOG_RECOVERY_NEEDED)) {
+		xfs_info(mp, "Ending clean mount");
+		return 0;
+	}
+
+	/*
+	 * We can now run the transactions needed for the rest of recovery.
+	 * At this point we essentially run in normal mode, except that we
+	 * are still performing recovery actions rather than accepting new
+	 * requests: complete all the extent free intent records first, then
+	 * process the unlinked inode lists.
+	 */
+	error = xlog_recover_process_efis(log);
+	if (error) {
+		xfs_alert(mp, "Failed to recover EFIs");
+		return error;
+	}
+
+	/*
+	 * Sync the log to get all the EFIs out of the AIL.  This isn't
+	 * absolutely necessary, but it helps in case the unlink
+	 * transactions would have problems pushing the EFIs out of the way.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	xlog_recover_process_iunlinks(log);
+
+	xlog_recover_check_summary(log);
+
+	xfs_notice(mp, "Ending recovery (logdev: %s)",
+		mp->m_logname ? mp->m_logname : "internal");
+	log->l_flags &= ~XLOG_RECOVERY_NEEDED;
+	return 0;
+}
+
+
+#if defined(DEBUG)
+/*
+ * Read all of the agf and agi counters and check that they
+ * are consistent with the superblock counters.
+ */ +void +xlog_recover_check_summary( + struct xlog *log) +{ + xfs_mount_t *mp; + xfs_agf_t *agfp; + xfs_buf_t *agfbp; + xfs_buf_t *agibp; + xfs_agnumber_t agno; + __uint64_t freeblks; + __uint64_t itotal; + __uint64_t ifree; + int error; + + mp = log->l_mp; + + freeblks = 0LL; + itotal = 0LL; + ifree = 0LL; + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); + } + + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); + } + } +} +#endif /* DEBUG */ diff --git a/kernel/fs/xfs/xfs_message.c b/kernel/fs/xfs/xfs_message.c new file mode 100644 index 000000000..d8b67547a --- /dev/null +++ b/kernel/fs/xfs/xfs_message.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging: "XFS (<fsname>): " prefix, or "XFS: " without mp/m_fsname
+ */
+static void
+__xfs_printk(
+	const char		*level,	/* KERN_* prefix, printed first */
+	const struct xfs_mount	*mp,	/* may be NULL */
+	struct va_format	*vaf)	/* caller's format + va_list, for %pV */
+{
+	if (mp && mp->m_fsname) {	/* filesystem name known: include it */
+		printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+		return;
+	}
+	printk("%sXFS: %pV\n", level, vaf);
+}
+
+#define define_xfs_printk_level(func, kern_level)		\
+void func(const struct xfs_mount *mp, const char *fmt, ...)	\
+{								\
+	struct va_format	vaf;				\
+	va_list			args;				\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	__xfs_printk(kern_level, mp, &vaf);			\
+	va_end(args);						\
+}								\
+
+define_xfs_printk_level(xfs_emerg, KERN_EMERG);	/* one variadic wrapper per level */
+define_xfs_printk_level(xfs_alert, KERN_ALERT);
+define_xfs_printk_level(xfs_crit, KERN_CRIT);
+define_xfs_printk_level(xfs_err, KERN_ERR);
+define_xfs_printk_level(xfs_warn, KERN_WARNING);
+define_xfs_printk_level(xfs_notice, KERN_NOTICE);
+define_xfs_printk_level(xfs_info, KERN_INFO);
+#ifdef DEBUG	/* xfs_debug() is only a real function in DEBUG builds */
+define_xfs_printk_level(xfs_debug, KERN_DEBUG);
+#endif
+
+void
+xfs_alert_tag(
+	const struct xfs_mount	*mp,
+	int			panic_tag,	/* tested against xfs_panic_mask */
+	const char		*fmt, ...)
+{
+	struct va_format	vaf;
+	va_list			args;
+	int			do_panic = 0;	/* set when panic_tag is in xfs_panic_mask */
+
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+		xfs_alert(mp, "Transforming an alert into a BUG.");
+		do_panic = 1;
+	}
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	__xfs_printk(KERN_ALERT, mp, &vaf);	/* the alert is always printed */
+	va_end(args);
+
+	BUG_ON(do_panic);	/* evaluated only after the message went out */
+}
+
+void
+asswarn(char *expr, char *file, int line)	/* logs at warn level, then WARN_ON */
+{
+	xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	WARN_ON(1);
+}
+
+void
+assfail(char *expr, char *file, int line)	/* logs at emerg level, then BUG */
+{
+	xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	BUG();
+}
+
+void
+xfs_hex_dump(void *p, int length)	/* dumps length bytes at p, KERN_ALERT level */
+{
+	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+}
diff --git a/kernel/fs/xfs/xfs_message.h b/kernel/fs/xfs/xfs_message.h
new file mode 100644
index 000000000..854011557
--- /dev/null
+++ b/kernel/fs/xfs/xfs_message.h
@@ -0,0 +1,64 @@
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+struct xfs_mount;	/* only used through pointers in this header */
+
+extern __printf(2, 3)	/* fmt is argument 2, varargs start at 3 */
+void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(3, 4)	/* shifted by one: the tag precedes fmt */
+void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
+extern __printf(2, 3)
+void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+
+#ifdef DEBUG
+extern __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#else	/* !DEBUG: xfs_debug() is an empty inline */
+static inline __printf(2, 3)
+void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
+#endif
+
+#define xfs_printk_ratelimited(func, dev, fmt, ...)		\
+do {	/* the static state below exists once per expansion */	\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+	if (__ratelimit(&_rs))						\
+		func(dev, fmt, ##__VA_ARGS__);			\
+} while (0)
+
+#define xfs_emerg_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__)
+#define xfs_alert_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__)
+#define xfs_crit_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__)
+#define xfs_err_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__)
+#define xfs_warn_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__)
+#define xfs_notice_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__)
+#define xfs_info_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__)
+#define xfs_debug_ratelimited(dev, fmt, ...)				\
+	xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__)
+
+extern void assfail(char *expr, char *f, int l);	/* expression text, file, line */
+extern void asswarn(char *expr, char *f, int l);
+
+extern void xfs_hex_dump(void *p, int length);
+
+#endif	/* __XFS_MESSAGE_H */
diff --git a/kernel/fs/xfs/xfs_mount.c b/kernel/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000..6f23fbdfb
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mount.c
@@ -0,0 +1,1283 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_fsops.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_sysfs.h" + + +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; + +/* + * See if the UUID is unique among mounted XFS filesystems. + * Mount fails if UUID is nil or a FS with the same UUID is already mounted. 
+ */ +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; + + if (uuid_is_nil(uuid)) { + xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + return -EINVAL; + } + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } + + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; + + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); + return -EINVAL; +} + +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) +{ + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; + + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; + + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); +} + + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. 
+ */ +STATIC void +xfs_free_perag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); + ASSERT(sbp->sb_blocklog >= BBSHIFT); + + /* Limited by ULONG_MAX of page cache index */ + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return -EFBIG; + return 0; +} + +int +xfs_initialize_perag( + xfs_mount_t *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = 0; + xfs_perag_t *pag; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_sb_t *sbp = &mp->m_sb; + int error = -ENOMEM; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. 
+ */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + spin_lock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; + + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp, agcount); + else + index = xfs_set_inode64(mp, agcount); + + if (maxagi) + *maxagi = index; + return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; +} + +/* + * xfs_readsb + * + * Does the initial read of the superblock. 
+ */
+int
+xfs_readsb(
+	struct xfs_mount *mp,
+	int		flags)
+{
+	unsigned int	sector_size;
+	struct xfs_buf	*bp;
+	struct xfs_sb	*sbp = &mp->m_sb;
+	int		error;
+	/* warnings are printed unless the caller passed XFS_MFSI_QUIET */
+	int		loud = !(flags & XFS_MFSI_QUIET);
+	const struct xfs_buf_ops *buf_ops;
+
+	ASSERT(mp->m_sb_bp == NULL);
+	ASSERT(mp->m_ddev_targp != NULL);
+
+	/*
+	 * For the initial read, we must guess at the sector
+	 * size based on the block device. It's enough to
+	 * get the sb_sectsize out of the superblock and
+	 * then reread with the proper length.
+	 * We don't verify it yet, because it may not be complete.
+	 */
+	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+	/* NULL buf_ops also marks "first pass" for the reread check below */
+	buf_ops = NULL;
+
+	/*
+	 * Allocate a (locked) buffer to hold the superblock.
+	 * This will be kept around at all times to optimize
+	 * access to the superblock.
+	 */
+reread:
+	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
+				   BTOBB(sector_size), 0, &bp, buf_ops);
+	if (error) {
+		if (loud)
+			xfs_warn(mp, "SB validate failed with error %d.", error);
+		/* bad CRC means corrupted metadata */
+		if (error == -EFSBADCRC)
+			error = -EFSCORRUPTED;
+		return error;
+	}
+
+	/*
+	 * Initialize the mount structure from the superblock.
+	 */
+	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+
+	/*
+	 * If we haven't validated the superblock, do so now before we try
+	 * to check the sector size and reread the superblock appropriately.
+	 */
+	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+		if (loud)
+			xfs_warn(mp, "Invalid superblock magic number");
+		error = -EINVAL;
+		goto release_buf;
+	}
+
+	/*
+	 * We must be able to do sector-sized and sector-aligned IO.
+	 */
+	if (sector_size > sbp->sb_sectsize) {
+		if (loud)
+			xfs_warn(mp, "device supports %u byte sectors (not %u)",
+				sector_size, sbp->sb_sectsize);
+		error = -ENOSYS;
+		goto release_buf;
+	}
+
+	if (buf_ops == NULL) {
+		/*
+		 * Re-read the superblock so the buffer is correctly sized,
+		 * and properly verified.
+		 */
+		xfs_buf_relse(bp);
+		sector_size = sbp->sb_sectsize;
+		/* setting buf_ops here means the reread happens only once */
+		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
+		goto reread;
+	}
+
+	xfs_reinit_percpu_counters(mp);
+
+	/* no need to be quiet anymore, so reset the buf ops */
+	bp->b_ops = &xfs_sb_buf_ops;
+
+	/* success: the buffer stays attached to the mount, unlocked */
+	mp->m_sb_bp = bp;
+	xfs_buf_unlock(bp);
+	return 0;
+
+release_buf:
+	xfs_buf_relse(bp);
+	return error;
+}
+
+/*
+ * Update alignment values based on mount options and sb values
+ *
+ * Returns 0, or -EINVAL when the stripe geometry given at mount time
+ * (m_dalign/m_swidth) cannot be used with this filesystem.
+ */
+STATIC int
+xfs_update_alignment(xfs_mount_t *mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+
+	if (mp->m_dalign) {
+		/*
+		 * If stripe unit and stripe width are not multiples
+		 * of the fs blocksize the mount is rejected with -EINVAL.
+		 */
+		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+			xfs_warn(mp,
+		"alignment check failed: sunit/swidth vs. blocksize(%d)",
+				sbp->sb_blocksize);
+			return -EINVAL;
+		} else {
+			/*
+			 * Convert the stripe unit and width to FSBs.
+			 */
+			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+			/* the AG size must be a multiple of the stripe unit */
+			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
+				xfs_warn(mp,
+			"alignment check failed: sunit/swidth vs. agsize(%d)",
+					 sbp->sb_agblocks);
+				return -EINVAL;
+			} else if (mp->m_dalign) {
+				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+			} else {
+				/* the conversion above left a stripe unit of 0 */
+				xfs_warn(mp,
+			"alignment check failed: sunit(%d) less than bsize(%d)",
+					 mp->m_dalign, sbp->sb_blocksize);
+				return -EINVAL;
+			}
+		}
+
+		/*
+		 * Update superblock with new values
+		 * and log changes
+		 */
+		if (xfs_sb_version_hasdalign(sbp)) {
+			if (sbp->sb_unit != mp->m_dalign) {
+				sbp->sb_unit = mp->m_dalign;
+				mp->m_update_sb = true;
+			}
+			if (sbp->sb_width != mp->m_swidth) {
+				sbp->sb_width = mp->m_swidth;
+				mp->m_update_sb = true;
+			}
+		} else {
+			xfs_warn(mp,
+	"cannot change alignment: superblock does not support data alignment");
+			return -EINVAL;
+		}
+	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+		    xfs_sb_version_hasdalign(&mp->m_sb)) {
+			/* no geometry given at mount: take it from the superblock */
+			mp->m_dalign = sbp->sb_unit;
+			mp->m_swidth = sbp->sb_width;
+	}
+
+	return 0;
+}
+
+/*
+ * Set the maximum inode count for this filesystem
+ *
+ * The result is stored in mp->m_maxicount; it is 0 when sb_imax_pct is 0.
+ */
+STATIC void
+xfs_set_maxicount(xfs_mount_t *mp)
+{
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	__uint64_t	icount;
+
+	if (sbp->sb_imax_pct) {
+		/*
+		 * Make sure the maximum inode count is a multiple
+		 * of the units we allocate inodes in.
+		 */
+		/* sb_imax_pct percent of the data blocks ... */
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		/* ... rounded down to whole m_ialloc_blks units ... */
+		do_div(icount, mp->m_ialloc_blks);
+		/* ... then scaled by the sb_inopblog shift */
+		mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
+				   sbp->sb_inopblog;
+	} else {
+		mp->m_maxicount = 0;
+	}
+}
+
+/*
+ * Set the default minimum read and write sizes unless
+ * already specified in a mount option.
+ * We use smaller I/O sizes when the file system
+ * is being used for NFS service (wsync mount option).
+ */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; + + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { + if (mp->m_flags & XFS_MOUNT_WSYNC) { + readio_log = XFS_WSYNC_READIO_LOG; + writeio_log = XFS_WSYNC_WRITEIO_LOG; + } else { + readio_log = XFS_READIO_LOG_LARGE; + writeio_log = XFS_WRITEIO_LOG_LARGE; + } + } else { + readio_log = mp->m_readio_log; + writeio_log = mp->m_writeio_log; + } + + if (sbp->sb_blocklog > readio_log) { + mp->m_readio_log = sbp->sb_blocklog; + } else { + mp->m_readio_log = readio_log; + } + mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog); + if (sbp->sb_blocklog > writeio_log) { + mp->m_writeio_log = sbp->sb_blocklog; + } else { + mp->m_writeio_log = writeio_log; + } + mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} + +/* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); + } +} + + +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && mp->m_inoalign_mask && + !(mp->m_dalign & mp->m_inoalign_mask)) + mp->m_sinoalign = mp->m_dalign; + else + mp->m_sinoalign = 0; +} + +/* + * Check that the data (and log if separate) is an ok size. 
+ */ +STATIC int +xfs_check_sizes( + struct xfs_mount *mp) +{ + struct xfs_buf *bp; + xfs_daddr_t d; + int error; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { + xfs_warn(mp, "filesystem size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "last sector read failed"); + return error; + } + xfs_buf_relse(bp); + + if (mp->m_logdev_targp == mp->m_ddev_targp) + return 0; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "log device read failed"); + return error; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +int +xfs_mount_reset_sbqflags( + struct xfs_mount *mp) +{ + mp->m_qflags = 0; + + /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ + if (mp->m_sb.sb_qflags == 0) + return 0; + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = 0; + spin_unlock(&mp->m_sb_lock); + + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) + return 0; + + return xfs_sync_sb(mp, false); +} + +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. 
+ */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + +/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; + + xfs_sb_mount_common(mp, sbp); + + /* + * Check for a mismatched features2 values. Older kernels read & wrote + * into the wrong sb offset for sb_features2 on some platforms due to + * xfs_sb_t not being 64bit size aligned when sb_features2 was added, + * which made older superblock reading/writing routines swap it as a + * 64-bit value. + * + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the existing + * features2 field in case it has already been modified; we don't want + * to lose any features. We then update the bad location with the ORed + * value so that older kernels will see any features2 flags. The + * superblock writeback code ensures the new sb_features2 is copied to + * sb_bad_features2 before it is logged or written to disk. + */ + if (xfs_sb_has_mismatched_features2(sbp)) { + xfs_warn(mp, "correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + mp->m_update_sb = true; + + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. 
+ */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + mp->m_update_sb = true; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + mp->m_update_sb = true; + } + + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_sb = true; + } + + /* + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); + if (error) + goto out; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_sysfs; + + /* + * Set the minimum read and write sizes + */ + xfs_set_rw_sizes(mp); + + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. 
+ */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + } + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) is an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto out_remove_uuid; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + xfs_warn(mp, "RT mount failed"); + goto out_remove_uuid; + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } + + /* + * Initialize the precomputed transaction reservations values. + */ + xfs_trans_init(mp); + + /* + * Allocate and initialize the per-ag data. + */ + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_free_dir; + } + + if (!sbp->sb_logblocks) { + xfs_warn(mp, "no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_free_perag; + } + + /* + * log's mount-time initialization. 
Perform 1st part recovery if needed + */ + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + xfs_warn(mp, "log mount failed"); + goto out_fail_wait; + } + + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) + goto out_log_dealloc; + } + + /* + * Get and sanity-check the root inode. + * Save the pointer to it in the mount structure. 
+ */ + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); + if (error) { + xfs_warn(mp, "failed to read root inode"); + goto out_log_dealloc; + } + + ASSERT(rip != NULL); + + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { + xfs_warn(mp, "corrupted root inode %llu: not a directory", + (unsigned long long)rip->i_ino); + xfs_iunlock(rip, XFS_ILOCK_EXCL); + XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, + mp); + error = -EFSCORRUPTED; + goto out_rele_rip; + } + mp->m_rootip = rip; /* save it */ + + xfs_iunlock(rip, XFS_ILOCK_EXCL); + + /* + * Initialize realtime inode pointers in the mount structure + */ + error = xfs_rtmount_inodes(mp); + if (error) { + /* + * Free up the root inode. + */ + xfs_warn(mp, "failed to read RT inodes"); + goto out_rele_rip; + } + + /* + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. + */ + if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + goto out_rtunmount; + } + } + + /* + * Initialise the XFS quota management subsystem for this mount + */ + if (XFS_IS_QUOTA_RUNNING(mp)) { + error = xfs_qm_newmount(mp, "amount, "aflags); + if (error) + goto out_rtunmount; + } else { + ASSERT(!XFS_IS_QUOTA_ON(mp)); + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o uquota/pquota/gquota options, revoke the + * quotachecked license. + */ + if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { + xfs_notice(mp, "resetting quota flags"); + error = xfs_mount_reset_sbqflags(mp); + if (error) + goto out_rtunmount; + } + } + + /* + * Finish recovering the file system. This part needed to be + * delayed until after the root and real-time bitmap inodes + * were consistently read in. 
+ */ + error = xfs_log_mount_finish(mp); + if (error) { + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; + } + + /* + * Complete the quota initialisation, post-log-replay component. + */ + if (quotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; + + xfs_qm_mount_quotas(mp); + } + + /* + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. + */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + } + + return 0; + + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + out_log_dealloc: + xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); + out_free_perag: + xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); + out_remove_uuid: + xfs_uuid_unmount(mp); + out_remove_sysfs: + xfs_sysfs_del(&mp->m_kobj); + out: + return error; +} + +/* + * This flushes out the inodes,dquots and the superblock, unmounts the + * log and makes sure that incore structures are freed. 
+ */
+void
+xfs_unmountfs(
+	struct xfs_mount	*mp)
+{
+	__uint64_t		resblks;
+	int			error;
+
+	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
+	xfs_qm_unmount_quotas(mp);
+	xfs_rtunmount_inodes(mp);
+	IRELE(mp->m_rootip);
+
+	/*
+	 * We can potentially deadlock here if we have an inode cluster
+	 * that has been freed and has its buffer still pinned in memory
+	 * because the transaction is still sitting in an iclog. The stale
+	 * inodes on that buffer will have their flush locks held until the
+	 * transaction hits the disk and the callbacks run. The inode
+	 * flush takes the flush lock unconditionally and with nothing to
+	 * push out the iclog we will never get that unlocked. Hence we
+	 * need to force the log first.
+	 */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * Flush all pending changes from the AIL.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/*
+	 * And reclaim all inodes.  At this point there should be no dirty
+	 * inodes and none should be pinned or locked, but use synchronous
+	 * reclaim just to be sure. We can stop background inode reclaim
+	 * here as well if it is still running.
+	 */
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	xfs_qm_unmount(mp);
+
+	/*
+	 * Unreserve any blocks we have so that when we unmount we don't account
+	 * the reserved free space as used. This is really only necessary for
+	 * lazy superblock counting because it trusts the incore superblock
+	 * counters to be absolutely correct on clean unmount.
+	 *
+	 * We don't bother correcting this elsewhere for lazy superblock
+	 * counting because on mount of an unclean filesystem we reconstruct the
+	 * correct counter value and this is irrelevant.
+	 *
+	 * For non-lazy counter filesystems, this doesn't matter at all because
+	 * we only ever apply deltas to the superblock and hence the incore
+	 * value does not matter....
+	 */
+	resblks = 0;
+	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	if (error)
+		xfs_warn(mp, "Unable to free reserved block pool. "
+				"Freespace may not be correct on next mount.");
+
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "Unable to update superblock counters. "
+				"Freespace may not be correct on next mount.");
+
+	xfs_log_unmount(mp);
+	xfs_da_unmount(mp);
+	xfs_uuid_unmount(mp);
+
+#if defined(DEBUG)
+	xfs_errortag_clearall(mp, 0);
+#endif
+	xfs_free_perag(mp);
+
+	xfs_sysfs_del(&mp->m_kobj);
+}
+
+/*
+ * Determine whether modifications can proceed. The caller specifies the minimum
+ * freeze level at which modifications must be refused, so that some operations
+ * can still run while a freeze is in progress. Returns false when the fs is
+ * frozen to at least @level, has been shut down, or is mounted read-only.
+ */
+bool
+xfs_fs_writable(
+	struct xfs_mount	*mp,
+	int			level)
+{
+	ASSERT(level > SB_UNFROZEN);
+	if ((mp->m_super->s_writers.frozen >= level) ||
+	    XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
+		return false;
+
+	return true;
+}
+
+/*
+ * xfs_log_sbcount
+ *
+ * Sync the superblock counters to disk. Returns 0 without doing anything if
+ * the filesystem is not writable or does not use lazy superblock counters.
+ * Note this code can be called during the process of freezing, so we use the
+ * transaction allocator that does not block when the transaction subsystem is
+ * in its frozen state.
+ */
+int
+xfs_log_sbcount(xfs_mount_t *mp)
+{
+	/* allow this to proceed during the freeze sequence... */
+	if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+		return 0;
+
+	/*
+	 * We don't need to do this if we are updating the superblock
+	 * counters on every modification.
+	 */
+	if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
+		return 0;
+
+	return xfs_sync_sb(mp, true);
+}
+
+/*
+ * Deltas for the inode count are +/-64, hence we use a large batch size
+ * of 128 so we don't need to take the counter lock on every update.
+ */ +#define XFS_ICOUNT_BATCH 128 +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) +{ + __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); + return -EINVAL; + } + return 0; +} + +int +xfs_mod_ifree( + struct xfs_mount *mp, + int64_t delta) +{ + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; +} + +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. + */ +#define XFS_FDBLOCKS_BATCH 1024 +int +xfs_mod_fdblocks( + struct xfs_mount *mp, + int64_t delta, + bool rsvd) +{ + int64_t lcounter; + long long res_used; + s32 batch; + + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } + + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } + + /* + * Taking blocks away, need to be more accurate the closer we + * are to zero. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are real close to + * ENOSPC. 
+ */ + if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + XFS_FDBLOCKS_BATCH) < 0) + batch = 1; + else + batch = XFS_FDBLOCKS_BATCH; + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + XFS_FDBLOCKS_BATCH) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; + + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; + } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: + spin_unlock(&mp->m_sb_lock); + return -ENOSPC; +} + +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; + spin_unlock(&mp->m_sb_lock); + return ret; +} + +/* + * xfs_getsb() is called to obtain the buffer for the superblock. + * The buffer is returned locked and read in from disk. + * The buffer should be released with a call to xfs_brelse(). + * + * If the flags parameter is BUF_TRYLOCK, then we'll only return + * the superblock buffer if it can be locked without sleeping. + * If it can't then we'll return NULL. + */ +struct xfs_buf * +xfs_getsb( + struct xfs_mount *mp, + int flags) +{ + struct xfs_buf *bp = mp->m_sb_bp; + + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) + return NULL; + xfs_buf_lock(bp); + } + + xfs_buf_hold(bp); + ASSERT(XFS_BUF_ISDONE(bp)); + return bp; +} + +/* + * Used to free the superblock along various error paths. 
+ */
+void
+xfs_freesb(
+	struct xfs_mount	*mp)
+{
+	struct xfs_buf		*bp = mp->m_sb_bp;
+
+	xfs_buf_lock(bp);
+	mp->m_sb_bp = NULL;	/* clear the mount's pointer before release */
+	xfs_buf_relse(bp);
+}
+
+/*
+ * If the data, log or (when present) realtime device is read-only, log that
+ * @message needs write access and return -EROFS; otherwise return 0.
+ */
+int
+xfs_dev_is_read_only(
+	struct xfs_mount	*mp,
+	char			*message)
+{
+	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
+	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
+	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
+		xfs_notice(mp, "%s required on read-only device.", message);
+		xfs_notice(mp, "write access unavailable, cannot proceed.");
+		return -EROFS;
+	}
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_mount.h b/kernel/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000..8c995a2cc
--- /dev/null
+++ b/kernel/fs/xfs/xfs_mount.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_MOUNT_H__
+#define	__XFS_MOUNT_H__
+
+struct xlog;
+struct xfs_inode;
+struct xfs_mru_cache;
+struct xfs_nameops;
+struct xfs_ail;
+struct xfs_quotainfo;
+struct xfs_dir_ops;
+struct xfs_da_geometry;
+
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+	XFS_LOWSP_1_PCNT = 0,
+	XFS_LOWSP_2_PCNT,
+	XFS_LOWSP_3_PCNT,
+	XFS_LOWSP_4_PCNT,
+	XFS_LOWSP_5_PCNT,
+	XFS_LOWSP_MAX,
+};
+
+typedef struct xfs_mount {
+	struct super_block	*m_super;
+	xfs_tid_t		m_tid;		/* next unused tid for fs */
+	struct xfs_ail		*m_ail;		/* fs active log item list */
+
+	struct xfs_sb		m_sb;		/* copy of fs superblock */
+	spinlock_t		m_sb_lock;	/* sb counter lock */
+	struct percpu_counter	m_icount;	/* allocated inodes counter */
+	struct percpu_counter	m_ifree;	/* free inodes counter */
+	struct percpu_counter	m_fdblocks;	/* free block counter */
+
+	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
+	char			*m_fsname;	/* filesystem name */
+	int			m_fsname_len;	/* strlen of fs name */
+	char			*m_rtname;	/* realtime device name */
+	char			*m_logname;	/* external log device name */
+	int			m_bsize;	/* fs logical block size */
+	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
+	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
+	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
+	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
+	uint			m_readio_log;	/* min read size log bytes */
+	uint			m_readio_blocks; /* min read size blocks */
+	uint			m_writeio_log;	/* min write size log bytes */
+	uint			m_writeio_blocks; /* min write size blocks */
+	struct xfs_da_geometry	*m_dir_geo;	/* directory block geometry */
+	struct xfs_da_geometry	*m_attr_geo;	/* attribute block geometry */
+	struct xlog		*m_log;		/* log specific stuff */
+	int			m_logbufs;	/* number of log buffers */
+	int			m_logbsize;	/* size of each log buffer */
+	uint			m_rsumlevels;	/* rt summary levels */
+	uint			m_rsumsize;	/* size of rt summary, bytes */
+	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
+	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
+	struct xfs_inode	*m_rootip;	/* pointer to root directory */
+	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
+	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
+	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
+	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+	__uint8_t		m_blkbit_log;	/* blocklog + NBBY */
+	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
+	__uint8_t		m_agno_log;	/* log #ag's */
+	__uint8_t		m_agino_log;	/* #bits for agino in inum */
+	uint			m_inode_cluster_size;/* min inode buf size */
+	uint			m_blockmask;	/* sb_blocksize-1 */
+	uint			m_blockwsize;	/* sb_blocksize in words */
+	uint			m_blockwmask;	/* blockwsize-1 */
+	uint			m_alloc_mxr[2];	/* max alloc btree records */
+	uint			m_alloc_mnr[2];	/* min alloc btree records */
+	uint			m_bmap_dmxr[2];	/* max bmap btree records */
+	uint			m_bmap_dmnr[2];	/* min bmap btree records */
+	uint			m_inobt_mxr[2];	/* max inobt btree records */
+	uint			m_inobt_mnr[2];	/* min inobt btree records */
+	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
+	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+	uint			m_in_maxlevels;	/* max inobt btree levels. */
+	struct radix_tree_root	m_perag_tree;	/* per-ag accounting info */
+	spinlock_t		m_perag_lock;	/* lock for m_perag_tree */
+	struct mutex		m_growlock;	/* growfs mutex */
+	int			m_fixedfsid[2];	/* unchanged for life of FS */
+	uint			m_dmevmask;	/* DMI events for this FS */
+	__uint64_t		m_flags;	/* global mount flags */
+	int			m_ialloc_inos;	/* inodes in inode allocation */
+	int			m_ialloc_blks;	/* blocks in inode allocation */
+	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
+	uint			m_qflags;	/* quota status flags */
+	struct xfs_trans_resv	m_resv;		/* precomputed res values */
+	__uint64_t		m_maxicount;	/* maximum inode count */
+	__uint64_t		m_resblks;	/* total reserved blocks */
+	__uint64_t		m_resblks_avail;/* available reserved blocks */
+	__uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
+	int			m_dalign;	/* stripe unit */
+	int			m_swidth;	/* stripe width */
+	int			m_sinoalign;	/* stripe unit inode alignment */
+	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
+	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
+	const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
+	const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
+	uint			m_chsize;	/* size of next field */
+	atomic_t		m_active_trans;	/* number trans frozen */
+	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
+	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
+	struct delayed_work	m_eofblocks_work; /* background eof blocks
+						     trimming */
+	bool			m_update_sb;	/* sb needs update in mount */
+	int64_t			m_low_space[XFS_LOWSP_MAX];
+						/* low free space thresholds */
+	struct xfs_kobj		m_kobj;
+
+	struct workqueue_struct *m_buf_workqueue;
+	struct workqueue_struct	*m_data_workqueue;
+	struct workqueue_struct	*m_unwritten_workqueue;
+	struct workqueue_struct	*m_cil_workqueue;
+	struct workqueue_struct	*m_reclaim_workqueue;
+	struct workqueue_struct	*m_log_workqueue;
+	struct workqueue_struct *m_eofblocks_workqueue;
+
+	/*
+	 * Generation of the filesystem layout.  This is incremented by each
+	 * growfs, and used by the pNFS server to ensure the client updates
+	 * its view of the block device once it gets a layout that might
+	 * reference the newly added blocks.  Does not need to be persistent
+	 * as long as we only allow file system size increments, but if we
+	 * ever support shrinks it would have to be persisted in addition
+	 * to various other kinds of pain inflicted on the pNFS server.
+	 */
+	__uint32_t		m_generation;
+} xfs_mount_t;
+
+/*
+ * Flags for m_flags.
+ */
+#define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
+						   must be synchronous except
+						   for space allocations */
+#define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
+#define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
+						   operations, typically for
+						   disk errors in metadata */
+#define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
+#define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
+						   allocations */
+#define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
+#define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
+#define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
+#define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
+#define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
+						 * 32 bits in size */
+#define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* user wants 32bit inodes */
+#define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
+#define XFS_MOUNT_BARRIER	(1ULL << 17)
+#define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters */
+#define XFS_MOUNT_SWALLOC	(1ULL << 19)	/* turn on stripe width
+						 * allocation */
+#define XFS_MOUNT_RDONLY	(1ULL << 20)	/* read-only fs */
+#define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
+#define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
+						 * I/O size in stat() */
+#define XFS_MOUNT_FILESTREAMS	(1ULL << 24)
/* enable the filestreams
+						   allocator */
+#define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */
+
+
+/*
+ * Default minimum read and write sizes.
+ */
+#define XFS_READIO_LOG_LARGE	16
+#define XFS_WRITEIO_LOG_LARGE	16
+
+/*
+ * Max and min values for mount-option defined I/O
+ * preallocation sizes.
+ */
+#define XFS_MAX_IO_LOG		30	/* 1G */
+#define XFS_MIN_IO_LOG		PAGE_SHIFT
+
+/*
+ * Synchronous read and write sizes.  This should be
+ * better for NFSv2 wsync filesystems.
+ */
+#define	XFS_WSYNC_READIO_LOG	15	/* 32k */
+#define	XFS_WSYNC_WRITEIO_LOG	14	/* 16k */
+
+/*
+ * Allow large block sizes to be reported to userspace programs if the
+ * "largeio" mount option is used.
+ *
+ * If compatibility mode is specified, simply return the basic unit of caching
+ * so that we don't get inefficient read/modify/write I/O from user apps.
+ * Otherwise....
+ *
+ * If the underlying volume is a stripe, then return the stripe width in bytes
+ * as the recommended I/O size. If it is not a stripe and we've set a default
+ * buffered I/O size, return that, otherwise return the compat default.
+ */
+static inline unsigned long
+xfs_preferred_iosize(xfs_mount_t *mp)
+{
+	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
+		return PAGE_CACHE_SIZE;
+	return (mp->m_swidth ?
+		(mp->m_swidth << mp->m_sb.sb_blocklog) :
+		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
+			(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
+			PAGE_CACHE_SIZE));
+}
+
+#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp)	\
+				((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
+#define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+		int lnnum);
+#define xfs_force_shutdown(m,f)	\
+	xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
+
+#define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
+
+static inline xfs_agnumber_t
+xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	do_div(ld, mp->m_sb.sb_agblocks);	/* leaves the quotient in ld */
+	return (xfs_agnumber_t) ld;
+}
+
+static inline xfs_agblock_t
+xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
+{
+	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); /* remainder */
+}
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi, to improve the
+ * performance of allocation group selection.
+ */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + +extern int xfs_log_sbcount(xfs_mount_t *); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern int 
xfs_mountfs(xfs_mount_t *mp); +extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); +extern void xfs_unmountfs(xfs_mount_t *); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + +extern int xfs_mount_log_sb(xfs_mount_t *); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern int xfs_readsb(xfs_mount_t *, int); +extern void xfs_freesb(xfs_mount_t *); +extern bool xfs_fs_writable(struct xfs_mount *mp, int level); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +extern void xfs_set_low_space_thresholds(struct xfs_mount *); + +#endif /* __XFS_MOUNT_H__ */ diff --git a/kernel/fs/xfs/xfs_mru_cache.c b/kernel/fs/xfs/xfs_mru_cache.c new file mode 100644 index 000000000..f8a674d7f --- /dev/null +++ b/kernel/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. + * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. 
It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. 
To keep the list maintenance portion of these operations
+ * O(1) also, list tails need to be accessible without walking the entire list.
+ * This is the reason why doubly linked list heads are used.
+ */
+
+/*
+ * An MRU Cache is a dynamic data structure that stores its elements in a way
+ * that allows efficient lookups, but also groups them into discrete time
+ * intervals based on insertion time.  This allows elements to be efficiently
+ * and automatically reaped after a fixed period of inactivity.
+ *
+ * When a client data pointer is stored in the MRU Cache it needs to be added to
+ * both the data store and to one of the lists.  It must also be possible to
+ * access each of these entries via the other, i.e. to:
+ *
+ *    a) Walk a list, removing the corresponding data store entry for each item.
+ *    b) Look up a data store entry, then access its list entry directly.
+ *
+ * To achieve both of these goals, each entry must contain both a list entry and
+ * a key, in addition to the user's data pointer.  Note that it's not a good
+ * idea to have the client embed one of these structures at the top of their own
+ * data structure, because inserting the same item more than once would most
+ * likely result in a loop in one of the lists.  That's a sure-fire recipe for
+ * an infinite loop in the code.
+ */
+struct xfs_mru_cache {
+	struct radix_tree_root	store;     /* Core storage data structure.  */
+	struct list_head	*lists;    /* Array of lists, one per grp.  */
+	struct list_head	reap_list; /* Elements overdue for reaping. */
+	spinlock_t		lock;      /* Lock to protect this struct.  */
+	unsigned int		grp_count; /* Number of discrete groups.    */
+	unsigned int		grp_time;  /* Jiffies spanned by one grp.   */
+	unsigned int		lru_grp;   /* Group containing time zero.   */
+	unsigned long		time_zero; /* Time first element was added. */
+	xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+	struct delayed_work	work;      /* Workqueue data for reaping.   */
+	unsigned int		queued;	   /* work has been queued */
+};
+
+static struct workqueue_struct	*xfs_mru_reap_wq;
+
+/*
+ * When inserting, destroying or reaping, it's first necessary to update the
+ * lists relative to a particular time.  In the case of destroying, that time
+ * will be well in the future to ensure that all items are moved to the reap
+ * list.  In all other cases though, the time will be the current time.
+ *
+ * This function enters a loop, moving the contents of the LRU list to the reap
+ * list again and again until either a) the lists are all empty, or b) time zero
+ * has been advanced sufficiently to be within the immediate element lifetime.
+ *
+ * Case a) above is detected by counting how many groups are migrated and
+ * stopping when they've all been moved.  Case b) is detected by monitoring the
+ * time_zero field, which is updated as each group is migrated.
+ *
+ * The return value is the earliest time that more migration could be needed, or
+ * zero if there's no need to schedule more work because the lists are empty.
+ */
+STATIC unsigned long
+_xfs_mru_cache_migrate(
+	struct xfs_mru_cache	*mru,
+	unsigned long		now)
+{
+	unsigned int		grp;
+	unsigned int		migrated = 0;
+	struct list_head	*lru_list;
+
+	/* Nothing to do if the data store is empty. */
+	if (!mru->time_zero)
+		return 0;
+
+	/* While time zero is older than the time spanned by all the lists. */
+	while (mru->time_zero <= now - mru->grp_count * mru->grp_time) {
+
+		/*
+		 * If the LRU list isn't empty, migrate its elements to the tail
+		 * of the reap list.
+		 */
+		lru_list = mru->lists + mru->lru_grp;
+		if (!list_empty(lru_list))
+			list_splice_init(lru_list, mru->reap_list.prev);
+
+		/*
+		 * Advance the LRU group number, freeing the old LRU list to
+		 * become the new MRU list; advance time zero accordingly.
+		 */
+		mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count;
+		mru->time_zero += mru->grp_time;
+
+		/*
+		 * If reaping is so far behind that all the elements on all the
+		 * lists have been migrated to the reap list, they are all empty.
+		 */
+		if (++migrated == mru->grp_count) {
+			mru->lru_grp = 0;
+			mru->time_zero = 0;	/* zero marks the lists empty */
+			return 0;
+		}
+	}
+
+	/* Find the first non-empty list from the LRU end. */
+	for (grp = 0; grp < mru->grp_count; grp++) {
+
+		/* Check the grp'th list from the LRU end. */
+		lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count);
+		if (!list_empty(lru_list))
+			return mru->time_zero +
+			       (mru->grp_count + grp) * mru->grp_time;
+	}
+
+	/* All the lists must be empty. */
+	mru->lru_grp = 0;
+	mru->time_zero = 0;
+	return 0;
+}
+
+/*
+ * When inserting or doing a lookup, an element needs to be inserted into the
+ * MRU list.  The lists must be migrated first to ensure that they're
+ * up-to-date, otherwise the new element could be given a shorter lifetime in
+ * the cache than it should.
+ */
+STATIC void
+_xfs_mru_cache_list_insert(
+	struct xfs_mru_cache	*mru,
+	struct xfs_mru_cache_elem *elem)
+{
+	unsigned int		grp = 0;
+	unsigned long		now = jiffies;
+
+	/*
+	 * If the data store is empty, initialise time zero, leave grp set to
+	 * zero and start the work queue timer if necessary.  Otherwise, set grp
+	 * to the number of group times that have elapsed since time zero.
+	 */
+	if (!_xfs_mru_cache_migrate(mru, now)) {
+		mru->time_zero = now;
+		if (!mru->queued) {
+			mru->queued = 1;
+			queue_delayed_work(xfs_mru_reap_wq, &mru->work,
+			                   mru->grp_count * mru->grp_time);
+		}
+	} else {
+		grp = (now - mru->time_zero) / mru->grp_time;
+		grp = (mru->lru_grp + grp) % mru->grp_count;
+	}
+
+	/* Insert the element at the tail of the corresponding list. */
+	list_add_tail(&elem->list_node, mru->lists + grp);
+}
+
+/*
+ * When destroying or reaping, all the elements that were migrated to the reap
+ * list need to be deleted.
For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) +{ + struct xfs_mru_cache_elem *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + list_del_init(&elem->list_node); + mru->free_func(elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. 
+ */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); /* 0 means nothing is left to reap */ + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = (next != 0); /* flag only: storing the unsigned long time here could truncate to 0 */ + if (mru->queued) { + now = jiffies; + if (next <= now) + next = 0; /* already due: requeue with no delay */ + else + next -= now; /* convert the absolute time into a delay */ + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); + if (!xfs_mru_reap_wq) + return -ENOMEM; + return 0; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. This function + * returns 0 on success, -EINVAL for bad arguments or -ENOMEM if allocation fails. + */ +int +xfs_mru_cache_create( + struct xfs_mru_cache **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return -EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return -EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return -ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early.
*/ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = -ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +static void +xfs_mru_cache_flush( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_delayed_work_sync(&mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated.
+ */ +int +xfs_mru_cache_insert( + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) +{ + int error; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return -EINVAL; + + if (radix_tree_preload(GFP_NOFS)) + return -ENOMEM; + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + + spin_lock(&mru->lock); + error = radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + if (!error) + _xfs_mru_cache_list_insert(mru, elem); + spin_unlock(&mru->lock); + + return error; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the removed element is returned to the caller, otherwise this + * function will return a NULL pointer. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_remove( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) + list_del(&elem->list_node); + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To remove an element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim.
+ * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To release the internal data structure spinlock after a successful + * xfs_mru_cache_lookup() (one that returned an element), call + * xfs_mru_cache_done() with the data store pointer. + */ +void +xfs_mru_cache_done( + struct xfs_mru_cache *mru) + __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/kernel/fs/xfs/xfs_mru_cache.h b/kernel/fs/xfs/xfs_mru_cache.h new file mode 100644 index 000000000..fb5245ba5 --- /dev/null +++ b/kernel/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + +struct xfs_mru_cache; + +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; + +/* Callback type used to free an element that the cache is dropping. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/kernel/fs/xfs/xfs_pnfs.c b/kernel/fs/xfs/xfs_pnfs.c new file mode 100644 index 000000000..981a657ec --- /dev/null +++ b/kernel/fs/xfs/xfs_pnfs.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2014 Christoph Hellwig.
+ */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_pnfs.h" + +/* + * Ensure that we do not have any outstanding pNFS layouts that can be used by + * clients to directly read from or write to this inode. This must be called + * before every operation that can remove blocks from the extent map. + * Additionally we call it during the write operation, where aren't concerned + * about exposing unallocated blocks but just want to provide basic + * synchronization between a local writer and pNFS clients. mmap writes would + * also benefit from this sort of synchronization, but due to the tricky locking + * rules in the page fault path we don't bother. + */ +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + bool with_imutex) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); + error = break_layout(inode, true); + *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); + xfs_ilock(ip, *iolock); + } + + return error; +} + +/* + * Get a unique ID including its location so that the client can identify + * the exported device. 
+ */ +int +xfs_fs_get_uuid( + struct super_block *sb, + u8 *buf, + u32 *len, + u64 *offset) +{ + struct xfs_mount *mp = XFS_M(sb); + + printk_once(KERN_NOTICE +"XFS (%s): using experimental pNFS feature, use at your own risk!\n", + mp->m_fsname); + + if (*len < sizeof(uuid_t)) + return -EINVAL; + + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + *len = sizeof(uuid_t); + *offset = offsetof(struct xfs_dsb, sb_uuid); + return 0; +} + +static void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +} + +/* + * Get a layout for the pNFS client. + */ +int +xfs_fs_map_blocks( + struct inode *inode, + loff_t offset, + u64 length, + struct iomap *iomap, + bool write, + u32 *device_generation) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + loff_t limit; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + uint lock_flags; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * We can't export inodes residing on the realtime device. The realtime + * device doesn't have a UUID to identify it, so the client has no way + * to find it. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return -ENXIO; + + /* + * Lock out any other I/O before we flush and invalidate the pagecache, + * and then hand out a layout to the remote system. 
This is very + * similar to direct I/O, except that the synchronization is much more + * complicated. See the comment near xfs_break_layouts for a detailed + * explanation. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = -EINVAL; + limit = mp->m_super->s_maxbytes; + if (!write) + limit = max(limit, round_up(i_size_read(inode), + inode->i_sb->s_blocksize)); + if (offset > limit) + goto out_unlock; + if (offset > limit - length) + length = limit - offset; + + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (WARN_ON_ONCE(error)) + return error; + + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, lock_flags); + + if (error) + goto out_unlock; + + if (write) { + enum xfs_prealloc_flags flags = 0; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + error = xfs_iomap_write_direct(ip, offset, length, + &imap, nimaps); + if (error) + goto out_unlock; + + /* + * Ensure the next transaction is committed + * synchronously so that the blocks allocated and + * handed out to the client are guaranteed to be + * present even after a server crash. + */ + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; + } + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + xfs_bmbt_to_iomap(ip, iomap, &imap); + *device_generation = mp->m_generation; + return error; +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Ensure the size update falls into a valid allocated block. 
+ */ +static int +xfs_pnfs_validate_isize( + struct xfs_inode *ip, + xfs_off_t isize) +{ + struct xfs_bmbt_irec imap; + int nimaps = 1; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + return error; + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) + return -EIO; + return 0; +} + +/* + * Make sure the blocks described by maps are stable on disk. This includes + * converting any unwritten extents, flushing the disk cache and updating the + * time stamps (plus the file size when the caller extends it). + * + * Note that we rely on the caller to always send us a timestamp update so that + * we always commit a transaction here. If that stops being true we will have + * to manually flush the cache here similar to what the fsync code path does + * for datasyncs on files that have no dirty metadata. + */ +int +xfs_fs_commit_blocks( + struct inode *inode, + struct iomap *maps, + int nr_maps, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + bool update_isize = false; + int error, i; + loff_t size; + + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + size = i_size_read(inode); + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { + update_isize = true; + size = iattr->ia_size; + } + + for (i = 0; i < nr_maps; i++) { + u64 start, length, end; + + start = maps[i].offset; + if (start > size) + continue; + + end = start + maps[i].length; + if (end > size) + end = size; + + length = end - start; + if (!length) + continue; + + /* + * Make sure reads through the pagecache see the new data.
+ */ + error = invalidate_inode_pages2_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(error); + + error = xfs_iomap_write_unwritten(ip, start, length); + if (error) + goto out_drop_iolock; + } + + if (update_isize) { + error = xfs_pnfs_validate_isize(ip, size); + if (error) + goto out_drop_iolock; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_iolock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + xfs_setattr_time(ip, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; + } + + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_drop_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} diff --git a/kernel/fs/xfs/xfs_pnfs.h b/kernel/fs/xfs/xfs_pnfs.h new file mode 100644 index 000000000..8147ac108 --- /dev/null +++ b/kernel/fs/xfs/xfs_pnfs.h @@ -0,0 +1,19 @@ +#ifndef _XFS_PNFS_H +#define _XFS_PNFS_H 1 + +#ifdef CONFIG_NFSD_PNFS +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, + struct iomap *iomap, bool write, u32 *device_generation); +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, + struct iattr *iattr); + +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +#else +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _XFS_PNFS_H */ diff --git a/kernel/fs/xfs/xfs_qm.c b/kernel/fs/xfs/xfs_qm.c new file mode 100644 index 000000000..5538468c7 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm.c @@ -0,0 +1,1939 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_cksum.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. + */ +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); + + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); +/* + * We use the batch lookup interface to iterate over the dquots as it + * currently is the only interface into the radix tree code that allows + * fuzzy lookups instead of exact matches. Holding the lock over multiple + * operations is fine as all callers are used either during mount/umount + * or quotaoff. 
+ */ +#define XFS_DQ_LOOKUP_BATCH 32 + +STATIC int +xfs_qm_dquot_walk( + struct xfs_mount *mp, + int type, + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + uint32_t next_index; + int last_error = 0; + int skipped; + int nr_found; + +restart: + skipped = 0; + next_index = 0; + nr_found = 0; + + while (1) { + struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; + int error = 0; + int i; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)batch, + next_index, XFS_DQ_LOOKUP_BATCH); + if (!nr_found) { + mutex_unlock(&qi->qi_tree_lock); + break; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_dquot *dqp = batch[i]; + + next_index = be32_to_cpu(dqp->q_core.d_id) + 1; + + error = execute(batch[i], data); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + mutex_unlock(&qi->qi_tree_lock); + + /* bail out if the filesystem is corrupted. */ + if (last_error == -EFSCORRUPTED) { + skipped = 0; + break; + } + } + + if (skipped) { + delay(1); + goto restart; + } + + return last_error; +} + + +/* + * Purge a dquot from all tracking data structures and free it. + */ +STATIC int +xfs_qm_dqpurge( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + + xfs_dqlock(dqp); + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + return -EAGAIN; + } + + dqp->dq_flags |= XFS_DQ_FREEING; + + xfs_dqflock(dqp); + + /* + * If we are turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + /* + * We don't care about getting disk errors here.
We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + xfs_dqflock(dqp); + } + + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + qi->qi_dquots--; + + /* + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. + */ + ASSERT(!list_empty(&dqp->q_lru)); + list_lru_del(&qi->qi_lru, &dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + + xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Purge the dquot cache for each quota type selected in flags. + */ +void +xfs_qm_dqpurge_all( + struct xfs_mount *mp, + uint flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_GQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_PQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); +} + +/* + * Purge all dquots, then destroy the quotainfo structure. + */ +void +xfs_qm_unmount( + struct xfs_mount *mp) +{ + if (mp->m_quotainfo) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_destroy_quotainfo(mp); + } +} + +/* + * Called from the vfsops layer. + */ +void +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Release the quota inodes.
+ */ + if (mp->m_quotainfo) { + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; + } + if (mp->m_quotainfo->qi_pquotaip) { + IRELE(mp->m_quotainfo->qi_pquotaip); + mp->m_quotainfo->qi_pquotaip = NULL; + } + } +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + error = 0; + + /* + * See if we already have it in the inode itself. IO_idqpp is &i_udquot, + * &i_gdquot or &i_pdquot. This made the code look weird, but made the logic a lot + * simpler. + */ + dqp = *IO_idqpp; + if (dqp) { + trace_xfs_dqattach_found(dqp); + return 0; + } + + /* + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. + */ + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; + + trace_xfs_dqattach_get(dqp); + + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + xfs_dqunlock(dqp); + return 0; +} + +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return false; + return true; +} + +/* + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach_locked( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + int error = 0; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + &ip->i_udquot); + if (error) + goto done; + ASSERT(ip->i_udquot); + } + + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + &ip->i_gdquot); + if (error) + goto done; + ASSERT(ip->i_gdquot); + } + + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { + error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + flags & XFS_QMOPT_DQALLOC, + &ip->i_pdquot); + if (error) + goto done; + ASSERT(ip->i_pdquot); + } + +done: + /* + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. + */ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + return error; +} + +int +xfs_qm_dqattach( + struct xfs_inode *ip, + uint flags) +{ + int error; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqattach_locked(ip, flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this is called by + * xfs_ireclaim.
+ */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) + return; + + trace_xfs_dquot_dqdetach(ip); + + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } +} + +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) + __releases(lru_lock) __acquires(lru_lock) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime; remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_lru_isolate(lru, &dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass.
+ */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed;
+} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_shrink_count(&qi->qi_lru, sc); +} + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +STATIC int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + error = list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. + */ + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; + + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); + mutex_init(&qinf->qi_tree_lock); + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + */ + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ?
XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = ddqp->d_btimer ? + be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = ddqp->d_itimer ? + be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? + be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = ddqp->d_bwarns ? + be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = ddqp->d_iwarns ? + be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? + be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; + qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + } + + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&qinf->qi_shrinker); + return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; +} + + +/* + * Gets called when unmounting a filesystem or when all quotas 
get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + + unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); + + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi); + mp->m_quotainfo = NULL; +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + uint flags) +{ + xfs_trans_t *tp; + int error; + int committed; + bool need_alloc = true; + + *ip = NULL; + /* + * With superblock that doesn't have separate pquotino, we + * share an inode between gquota and pquota. If the on-disk + * superblock has GQUOTA and the filesystem is now mounted + * with PQUOTA, just use sb_gquotino for sb_pquotino and + * vice-versa. 
+ */ + if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { + xfs_ino_t ino = NULLFSINO; + + if ((flags & XFS_QMOPT_PQUOTA) && + (mp->m_sb.sb_gquotino != NULLFSINO)) { + ino = mp->m_sb.sb_gquotino; + ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + } else if ((flags & XFS_QMOPT_GQUOTA) && + (mp->m_sb.sb_pquotino != NULLFSINO)) { + ino = mp->m_sb.sb_pquotino; + ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + } + if (ino != NULLFSINO) { + error = xfs_iget(mp, NULL, ino, 0, 0, ip); + if (error) + return error; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; + } + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + if (need_alloc) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, + &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } + } + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. 
+ */ + spin_lock(&mp->m_sb_lock); + if (flags & XFS_QMOPT_SBVERSION) { + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + + xfs_sb_version_addquota(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else if (flags & XFS_QMOPT_GQUOTA) + mp->m_sb.sb_gquotino = (*ip)->i_ino; + else + mp->m_sb.sb_pquotino = (*ip)->i_ino; + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_alert(mp, "%s failed (error %d)!", __func__, error); + } + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; +} + + +STATIC void +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + struct xfs_dqblk *dqb; + int j; + + trace_xfs_reset_dqcounts(bp, _RET_IP_); + + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); +#endif + dqb = bp->b_addr; + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find uninitialised dquot blks. See comment in xfs_dqcheck. 
+ */ + xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; + ddq->d_bcount = 0; + ddq->d_icount = 0; + ddq->d_rtbcount = 0; + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } +} + +STATIC int +xfs_qm_dqiter_bufs( + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_buf *bp; + int error; + int type; + + ASSERT(blkcnt > 0); + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + + /* + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. 
+ */ + if (error == -EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + + /* + * A corrupt buffer might not have a verifier attached, so + * make sure we have the correct one attached before writeback + * occurs. + */ + bp->b_ops = &xfs_dquot_buf_ops; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); + + /* goto the next block. */ + bno++; + firstid += mp->m_quotainfo->qi_dqperchunk; + } + + return error; +} + +/* + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_bmbt_irec *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racy, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return 0; + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + do { + uint lock_mode; + + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. 
+ */ + lock_mode = xfs_ilock_data_map_shared(qip); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); + xfs_iunlock(qip, lock_mode); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + mp->m_quotainfo->qi_dqperchunk; + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen, + &xfs_dquot_buf_ops); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. + */ + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; + } + } while (nmaps > 0); + +out: + kmem_free(map); + return error; +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. + */ +STATIC int +xfs_qm_quotacheck_dqadjust( + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. 
+ */ + ASSERT(error != -ESRCH); + ASSERT(error != -ENOENT); + return error; + } + + trace_xfs_dqadjust(dqp); + + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_res_icount++; + if (nblks) { + be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. + */ + if (dqp->q_core.d_id) { + xfs_qm_adjust_dqlimits(mp, dqp); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return error; + } + rtblks = 0; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); + *O_rtblks = (xfs_qcnt_t)rtblks; + return 0; +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. 
+ */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_qcnt_t nblks, rtblks = 0; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (xfs_is_quota_inode(&mp->m_sb, ino)) { + *res = BULKSTAT_RV_NOTHING; + return -EINVAL; + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + *res = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Walk thru the extent list and count the realtime blocks. + */ + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; + } + + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racy, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). 
+ */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; +} + +STATIC int +xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) +{ + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) + goto out_unlock; + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); +out_unlock: + xfs_dqunlock(dqp); + return error; +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +STATIC int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; + struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(uip || gip || pip); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + xfs_notice(mp, "Quotacheck needed: Please wait."); + + /* + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset + * their counters to zero. We need a clean slate. 
+ * We don't log our changes till later. + */ + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_GQUOTA_CHKD; + } + + if (pip) { + error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_PQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) + break; + + } while (!done); + + /* + * We've made all the changes that we need to make incore. Flush them + * down to disk buffers if everything was updated successfully. + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } + if (XFS_IS_GQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + if (XFS_IS_PQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). 
+ */ + if (error) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; + } + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; + mp->m_qflags |= flags; + + error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + + if (error) { + xfs_warn(mp, + "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", + error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + xfs_qm_destroy_quotainfo(mp); + if (xfs_mount_reset_sbqflags(mp)) { + xfs_warn(mp, + "Quotacheck: Failed to reset quota flags."); + } + } else + xfs_notice(mp, "Quotacheck: Done."); + return error; +} + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + struct xfs_mount *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. 
+ */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_sync_sb(mp, false)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. 
+ */
+STATIC int
+xfs_qm_init_quotainos(
+	xfs_mount_t	*mp)
+{
+	struct xfs_inode	*user_ip = NULL;
+	struct xfs_inode	*group_ip = NULL;
+	struct xfs_inode	*proj_ip = NULL;
+	uint			qino_flags = 0;
+	int			rc;
+
+	ASSERT(mp->m_quotainfo);
+
+	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+		/*
+		 * No quota feature in the superblock yet: the first
+		 * allocation below has to add it.
+		 */
+		qino_flags |= XFS_QMOPT_SBVERSION;
+	} else {
+		/*
+		 * Look up the quota inodes the superblock already knows
+		 * about, for each quota type that is on.
+		 */
+		if (XFS_IS_UQUOTA_ON(mp) &&
+		    mp->m_sb.sb_uquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_uquotino > 0);
+			rc = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+				      0, 0, &user_ip);
+			if (rc)
+				return rc;
+		}
+		if (XFS_IS_GQUOTA_ON(mp) &&
+		    mp->m_sb.sb_gquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_gquotino > 0);
+			rc = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+				      0, 0, &group_ip);
+			if (rc)
+				goto error_rele;
+		}
+		if (XFS_IS_PQUOTA_ON(mp) &&
+		    mp->m_sb.sb_pquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_pquotino > 0);
+			rc = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
+				      0, 0, &proj_ip);
+			if (rc)
+				goto error_rele;
+		}
+	}
+
+	/*
+	 * Allocate whichever of the three inodes is still missing.  The
+	 * XFS_QMOPT_SBVERSION request is passed along only until the first
+	 * user or group allocation has succeeded.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp) && user_ip == NULL) {
+		rc = xfs_qm_qino_alloc(mp, &user_ip,
+				       qino_flags | XFS_QMOPT_UQUOTA);
+		if (rc)
+			goto error_rele;
+
+		qino_flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_GQUOTA_ON(mp) && group_ip == NULL) {
+		rc = xfs_qm_qino_alloc(mp, &group_ip,
+				       qino_flags | XFS_QMOPT_GQUOTA);
+		if (rc)
+			goto error_rele;
+
+		qino_flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_PQUOTA_ON(mp) && proj_ip == NULL) {
+		rc = xfs_qm_qino_alloc(mp, &proj_ip,
+				       qino_flags | XFS_QMOPT_PQUOTA);
+		if (rc)
+			goto error_rele;
+	}
+
+	/* Success: hand the inode references over to the quotainfo. */
+	mp->m_quotainfo->qi_uquotaip = user_ip;
+	mp->m_quotainfo->qi_gquotaip = group_ip;
+	mp->m_quotainfo->qi_pquotaip = proj_ip;
+
+	return 0;
+
+error_rele:
+	/* Drop whatever inode references were taken before the failure. */
+	if (user_ip)
+		IRELE(user_ip);
+	if (group_ip)
+		IRELE(group_ip);
+	if (proj_ip)
+		IRELE(proj_ip);
+	return rc;
+}
+
+/*
+ * Remove one dquot from its per-type radix tree under qi_tree_lock,
+ * decrement the dquot count and destroy the dquot.
+ */
+STATIC void
+xfs_qm_dqfree_one(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_quotainfo	*qinf = dqp->q_mount->m_quotainfo;
+
+	mutex_lock(&qinf->qi_tree_lock);
+	radix_tree_delete(xfs_dquot_tree(qinf, dqp->q_core.d_flags),
+			  be32_to_cpu(dqp->q_core.d_id));
+	qinf->qi_dquots--;
+	mutex_unlock(&qinf->qi_tree_lock);
+
+	xfs_qm_dqdestroy(dqp);
+}
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid, gid and prid make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in	: inode (unlocked)
+ * out	: udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+	struct xfs_inode	*ip,
+	xfs_dqid_t		uid,
+	xfs_dqid_t		gid,
+	prid_t			prid,
+	uint			flags,
+	struct xfs_dquot	**O_udqpp,
+	struct xfs_dquot	**O_gdqpp,
+	struct xfs_dquot	**O_pdqpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_dquot	*uq = NULL;
+	struct xfs_dquot	*gq = NULL;
+	struct xfs_dquot	*pq = NULL;
+	int			error;
+	uint			lockflags;	/* ILOCK mode currently held */
+	/* Quota not running or not on: return 0, outputs are not written. */
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+
+	lockflags = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lockflags);
+
+	if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
+		gid = ip->i_d.di_gid;
+
+	/*
+	 * Attach the dquot(s) to this inode, doing a dquot allocation
+	 * if necessary. The dquot(s) will not be locked.
+	 */
+	if (XFS_NOT_DQATTACHED(mp, ip)) {
+		error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+		if (error) {
+			xfs_iunlock(ip, lockflags);
+			return error;
+		}
+	}
+
+	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_d.di_uid != uid) {
+			/*
+			 * What we need is the dquot that has this uid, and
+			 * if we send the inode to dqget, the uid of the inode
+			 * takes priority over what's sent in the uid argument.
+			 * We must unlock inode here before calling dqget if
+			 * we're not sending the inode, because otherwise
+			 * we'll deadlock by doing trans_reserve while
+			 * holding ilock.
+			 */
+			xfs_iunlock(ip, lockflags);
+			error = xfs_qm_dqget(mp, NULL, uid,
+						 XFS_DQ_USER,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &uq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				return error;	/* ilock already dropped above */
+			}
+			/*
+			 * Get the ilock in the right order.
+			 * From here on the inode is only held shared.
+			 */
+			xfs_dqunlock(uq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			/*
+			 * Take an extra reference, because we'll return
+			 * this to caller
+			 */
+			ASSERT(ip->i_udquot);
+			uq = xfs_qm_dqhold(ip->i_udquot);
+		}
+	}
+	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_d.di_gid != gid) {
+			xfs_iunlock(ip, lockflags);	/* same dance as for uid */
+			error = xfs_qm_dqget(mp, NULL, gid,
+						 XFS_DQ_GROUP,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &gq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				goto error_rele;
+			}
+			xfs_dqunlock(gq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_gdquot);
+			gq = xfs_qm_dqhold(ip->i_gdquot);
+		}
+	}
+	if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+		if (xfs_get_projid(ip) != prid) {
+			xfs_iunlock(ip, lockflags);	/* same dance as for uid */
+			error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+						 XFS_DQ_PROJ,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &pq);
+			if (error) {
+				ASSERT(error != -ENOENT);
+				goto error_rele;
+			}
+			xfs_dqunlock(pq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_pdquot);
+			pq = xfs_qm_dqhold(ip->i_pdquot);
+		}
+	}
+	if (uq)
+		trace_xfs_dquot_dqalloc(ip);
+
+	xfs_iunlock(ip, lockflags);
+	if (O_udqpp)	/* hand each dquot to the caller, or drop its reference */
+		*O_udqpp = uq;
+	else
+		xfs_qm_dqrele(uq);
+	if (O_gdqpp)
+		*O_gdqpp = gq;
+	else
+		xfs_qm_dqrele(gq);
+	if (O_pdqpp)
+		*O_pdqpp = pq;
+	else
+		xfs_qm_dqrele(pq);
+	return 0;
+
+error_rele:	/* reached with the ilock dropped; pq is still NULL here */
+	xfs_qm_dqrele(gq);
+	xfs_qm_dqrele(uq);
+	return error;
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ *
+ * The inode's block count (realtime or regular, depending on the inode)
+ * and one inode are moved from the dquot in *IO_olddq to newdq within
+ * transaction tp.  *IO_olddq is replaced by newdq with an extra reference
+ * held, and the previous dquot is returned to the caller.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**IO_olddq,
+	xfs_dquot_t	*newdq)
+{
+	xfs_dquot_t	*prevdq;
+	uint		bfield = XFS_IS_REALTIME_INODE(ip) ?
+				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
+
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+	/* old dquot */
+	prevdq = *IO_olddq;
+	ASSERT(prevdq);
+	ASSERT(prevdq != newdq);
+
+	xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
+	xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+
+	/* the sparkling new dquot */
+	xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
+	xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+
+	/*
+	 * Take an extra reference, because the inode is going to keep
+	 * this dquot pointer even after the trans_commit.
+	 */
+	*IO_olddq = xfs_qm_dqhold(newdq);
+
+	return prevdq;
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
+ *
+ * For each quota type that is on and whose new dquot id differs from the
+ * inode's current id, reserve the inode's blocks and one inode against the
+ * new dquot.  Delayed allocation blocks are then reserved against the new
+ * dquots and unreserved from the ones currently attached to the inode.
+ * Returns 0 or the error from a failed reservation.
+ */
+int
+xfs_qm_vop_chown_reserve(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	uint			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			delblks, blkflags, prjflags = 0;
+	struct xfs_dquot	*udq_unres = NULL;	/* old dquots to unreserve from */
+	struct xfs_dquot	*gdq_unres = NULL;
+	struct xfs_dquot	*pdq_unres = NULL;
+	struct xfs_dquot	*udq_delblks = NULL;	/* new dquots to reserve against */
+	struct xfs_dquot	*gdq_delblks = NULL;
+	struct xfs_dquot	*pdq_delblks = NULL;
+	int			error;
+
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	delblks = ip->i_delayed_blks;
+	blkflags = XFS_IS_REALTIME_INODE(ip) ?
+			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+
+	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+		udq_delblks = udqp;
+		/*
+		 * If there are delayed allocation blocks, then we have to
+		 * unreserve those from the old dquot, and add them to the
+		 * new dquot.
+		 */
+		if (delblks) {
+			ASSERT(ip->i_udquot);
+			udq_unres = ip->i_udquot;
+		}
+	}
+	if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+	    ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+		gdq_delblks = gdqp;
+		if (delblks) {
+			ASSERT(ip->i_gdquot);
+			gdq_unres = ip->i_gdquot;
+		}
+	}
+
+	if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
+	    xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+		prjflags = XFS_QMOPT_ENOSPC;	/* set only for a project id change */
+		pdq_delblks = pdqp;
+		if (delblks) {
+			ASSERT(ip->i_pdquot);
+			pdq_unres = ip->i_pdquot;
+		}
+	}
+
+	error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+				udq_delblks, gdq_delblks, pdq_delblks,
+				ip->i_d.di_nblocks, 1,
+				flags | blkflags | prjflags);
+	if (error)
+		return error;
+
+	/*
+	 * Do the delayed blks reservations/unreservations now. Since, these
+	 * are done without the help of a transaction, if a reservation fails
+	 * its previous reservations won't be automatically undone by trans
+	 * code. So, we have to do it manually here.
+	 */
+	if (delblks) {
+		/*
+		 * Do the reservations first. Unreservation can't fail.
+		 */
+		ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
+		ASSERT(udq_unres || gdq_unres || pdq_unres);
+		error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+			    udq_delblks, gdq_delblks, pdq_delblks,
+			    (xfs_qcnt_t)delblks, 0,
+			    flags | blkflags | prjflags);
+		if (error)
+			return error;
+		xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+				udq_unres, gdq_unres, pdq_unres,
+				-((xfs_qcnt_t)delblks), 0, blkflags);
+	}
+
+	return 0;
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+	struct xfs_inode	**i_tab)
+{
+	struct xfs_mount	*mp = i_tab[0]->i_mount;
+	int			i;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+
+	for (i = 0; (i < 4 && i_tab[i]); i++) {
+		struct xfs_inode	*ip = i_tab[i];
+		int			error;
+
+		/*
+		 * Watch out for duplicate entries in the table.
+ */ + if (i == 0 || ip != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + } + } + } + return 0; +} + +void +xfs_qm_vop_create_dqattach( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (udqp && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(ip->i_pdquot == NULL); + ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + + ip->i_pdquot = xfs_qm_dqhold(pdqp); + xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + diff --git a/kernel/fs/xfs/xfs_qm.h b/kernel/fs/xfs/xfs_qm.h new file mode 100644 index 000000000..996a04064 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + +struct xfs_inode; + +extern struct kmem_zone *xfs_qm_dqtrxzone; + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. 
+ */ +typedef struct xfs_quotainfo { + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; + struct xfs_inode *qi_uquotaip; /* user quota inode */ + struct xfs_inode *qi_gquotaip; /* group quota inode */ + struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct list_lru qi_lru; + int qi_dquots; + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct shrinker qi_shrinker; +} xfs_quotainfo_t; + +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + return &qi->qi_gquota_tree; + case XFS_DQ_PROJ: + return &qi->qi_pquota_tree; + default: + ASSERT(0); + } + return NULL; +} + +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + case XFS_DQ_PROJ: + return 
dqp->q_mount->m_quotainfo->qi_pquotaip; + default: + ASSERT(0); + } + return NULL; +} + +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, + long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); + +/* + * We keep the usr, grp, and prj dquots separately so that locking will be + * easier to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_PRJ, + XFS_QM_TRANS_DQTYPES +}; +#define XFS_QM_TRANS_MAXDQS 2 +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. 
+ */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 + +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); + +/* dquot stuff */ +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); + +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct qc_dqblk *); +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, + struct qc_dqblk *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); + +#endif /* __XFS_QM_H__ */ diff --git a/kernel/fs/xfs/xfs_qm_bhv.c b/kernel/fs/xfs/xfs_qm_bhv.c new file mode 100644 index 000000000..3e52d5de7 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm_bhv.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_qm.h" + + +STATIC void +xfs_fill_statvfs_from_dquot( + struct kstatfs *statp, + struct xfs_dquot *dqp) +{ + __uint64_t limit; + + limit = dqp->q_core.d_blk_softlimit ? + be64_to_cpu(dqp->q_core.d_blk_softlimit) : + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; + statp->f_bfree = statp->f_bavail = + (statp->f_blocks > dqp->q_res_bcount) ? + (statp->f_blocks - dqp->q_res_bcount) : 0; + } + + limit = dqp->q_core.d_ino_softlimit ? + be64_to_cpu(dqp->q_core.d_ino_softlimit) : + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (limit && statp->f_files > limit) { + statp->f_files = limit; + statp->f_ffree = + (statp->f_files > dqp->q_res_icount) ? + (statp->f_ffree - dqp->q_res_icount) : 0; + } +} + + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. 
+ */ +void +xfs_qm_statvfs( + xfs_inode_t *ip, + struct kstatfs *statp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; + + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + xfs_fill_statvfs_from_dquot(statp, dqp); + xfs_qm_dqput(dqp); + } +} + +int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; + + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); + return -EPERM; + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occurred, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp); + } else { + /* + * Clear the quota flags, but remember them. 
This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = true; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} diff --git a/kernel/fs/xfs/xfs_qm_syscalls.c b/kernel/fs/xfs/xfs_qm_syscalls.c new file mode 100644 index 000000000..9a25c9275 --- /dev/null +++ b/kernel/fs/xfs/xfs_qm_syscalls.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. 
+ * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + uint dqtype; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == -EEXIST here. + */ + if ((mp->m_qflags & flags) == 0) + return -EEXIST; + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&q->qi_quotaofflock); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + return xfs_sync_sb(mp, false); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. 
+ */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } + if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) + goto out_unlock; + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. + */ + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_unlock; + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. 
This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock every time we depend on quotas being on. + */ + mp->m_qflags &= ~flags; + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. + */ + xfs_qm_dqpurge_all(mp, dqtype); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_unlock; + } + + /* + * If all quotas are completely turned off, close shop. + */ + if (mp->m_qflags == 0) { + mutex_unlock(&q->qi_quotaofflock); + xfs_qm_destroy_quotainfo(mp); + return 0; + } + + /* + * Release our quotainode references if we don't need them anymore. 
+ */ + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { + IRELE(q->qi_pquotaip); + q->qi_pquotaip = NULL; + } + +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + ASSERT(ip->i_d.di_nextents == 0); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + +int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error = -EINVAL; + + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x", + __func__, flags, mp->m_qflags); + return -EINVAL; + } + + if (flags & XFS_DQ_USER) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, 
mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_PROJ) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + + return error; +} + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) + */ +int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + uint qf; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + flags &= ~(XFS_ALL_QUOTA_ACCT); + + if (flags == 0) { + xfs_debug(mp, "%s: zero flags, m_qflags=%x", + __func__, mp->m_qflags); + return -EINVAL; + } + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { + xfs_debug(mp, + "%s: Can't enforce without acct, flags=%x sbflags=%x", + __func__, flags, mp->m_sb.sb_qflags); + return -EINVAL; + } + /* + * If everything's up to-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return -EEXIST; + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + spin_lock(&mp->m_sb_lock); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + spin_unlock(&mp->m_sb_lock); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags) + return -EEXIST; + + error = xfs_sync_sb(mp, false); + if (error) + return error; + /* + * If we aren't trying to switch on quota enforcement, we are done. 
+ */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT))) + return 0; + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return -ESRCH; + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); + + return 0; +} + +#define XFS_QC_MASK \ + (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +int +xfs_qm_scall_setqlim( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *newlim) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; + int error; + xfs_qcnt_t hard, soft; + + if (newlim->d_fieldmask & ~XFS_QC_MASK) + return -EINVAL; + if ((newlim->d_fieldmask & XFS_QC_MASK) == 0) + return 0; + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. + */ + mutex_lock(&q->qi_quotaofflock); + + /* + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. 
+ */ + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { + ASSERT(error != -ENOENT); + goto out_unlock; + } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : + be64_to_cpu(ddq->d_blk_hardlimit); + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : + be64_to_cpu(ddq->d_blk_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_blk_hardlimit = cpu_to_be64(hard); + ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); + if (id == 0) { + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; + } + } else { + xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); + } + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : + be64_to_cpu(ddq->d_rtb_hardlimit); + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : + be64_to_cpu(ddq->d_rtb_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_rtb_hardlimit = cpu_to_be64(hard); + ddq->d_rtb_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; + } + } else { + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); + } + + hard = (newlim->d_fieldmask & QC_INO_HARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + be64_to_cpu(ddq->d_ino_hardlimit); + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? 
+ (xfs_qcnt_t) newlim->d_ino_softlimit : + be64_to_cpu(ddq->d_ino_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_ino_hardlimit = cpu_to_be64(hard); + ddq->d_ino_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; + } + } else { + xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); + } + + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & QC_SPC_WARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_INO_WARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above), and + * for warnings. + */ + if (newlim->d_fieldmask & QC_SPC_TIMER) { + q->qi_btimelimit = newlim->d_spc_timer; + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + } + if (newlim->d_fieldmask & QC_INO_TIMER) { + q->qi_itimelimit = newlim->d_ino_timer; + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); + } + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { + q->qi_rtbtimelimit = newlim->d_rt_spc_timer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); + } + if (newlim->d_fieldmask & QC_SPC_WARNS) + q->qi_bwarnlimit = newlim->d_spc_warns; + if (newlim->d_fieldmask & QC_INO_WARNS) + q->qi_iwarnlimit = newlim->d_ino_warns; + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; + } else { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. 
+ */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + error = xfs_trans_commit(tp, 0); + +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotaoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + *qoffstartp = NULL; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_log_sb(tp); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotaoff's performance. 
+ */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + if (error) + goto out; + + *qoffstartp = qoffi; +out: + return error; +} + + +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DQALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = -ENOENT; + goto out_put; + } + + memset(dst, 0, sizeof(*dst)); + dst->d_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); + dst->d_ino_count = dqp->q_res_icount; + dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rt_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rt_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the user level code, + * so return zeroes in that case. 
+ */ + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { + dst->d_spc_timer = 0; + dst->d_ino_timer = 0; + dst->d_rt_spc_timer = 0; + } + +#ifdef DEBUG + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + id != 0) { + if ((dst->d_space > dst->d_spc_softlimit) && + (dst->d_spc_softlimit > 0)) { + ASSERT(dst->d_spc_timer != 0); + } + if ((dst->d_ino_count > dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_ino_timer != 0); + } + } +#endif +out_put: + xfs_qm_dqput(dqp); + return error; +} + + +STATIC int +xfs_dqrele_inode( + struct xfs_inode *ip, + int flags, + void *args) +{ + /* skip quota inodes */ + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip || + ip == ip->i_mount->m_quotainfo->qi_pquotaip) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_pdquot == NULL); + return 0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. 
+ */
+void
+xfs_qm_dqrele_all_inodes(
+	struct xfs_mount *mp,
+	uint		 flags)		/* handed on, with xfs_dqrele_inode, to the iterator */
+{
+	ASSERT(mp->m_quotainfo);	/* m_quotainfo must be non-NULL here */
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+}
diff --git a/kernel/fs/xfs/xfs_quota.h b/kernel/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000..5376dd406
--- /dev/null
+++ b/kernel/fs/xfs/xfs_quota.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+#include "xfs_quota_defs.h"
+
+/*
+ * Kernel only quota definitions and functions
+ */
+
+struct xfs_trans;
+
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racy, but it is harmless in the context that it is used.
+ * The inode cannot go inactive as long as a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didn't have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */ +#define XFS_NOT_DQATTACHED(mp, ip) \ + ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \ + (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ + (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) + +#define XFS_QM_NEED_QUOTACHECK(mp) \ + ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ + (XFS_IS_PQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) + +/* + * The structure kept inside the xfs_trans_t keep track of dquot changes + * within a transaction and apply them later. + */ +typedef struct xfs_dqtrx { + struct xfs_dquot *qt_dquot; /* the dquot this refers to */ + ulong qt_blk_res; /* blks reserved on a dquot */ + ulong qt_blk_res_used; /* blks used from the reservation */ + ulong qt_ino_res; /* inode reserved on a dquot */ + ulong qt_ino_res_used; /* inodes used from the reservation */ + long qt_bcount_delta; /* dquot blk count changes */ + long qt_delbcnt_delta; /* delayed dquot blk count changes */ + long qt_icount_delta; /* dquot inode count changes */ + ulong qt_rtblk_res; /* # blks reserved on a dquot */ + ulong qt_rtblk_res_used;/* # blks used from reservation */ + long qt_rtbcount_delta;/* dquot realtime blk changes */ + long qt_delrtb_delta; /* delayed RT blk count changes */ +} xfs_dqtrx_t; + +#ifdef CONFIG_XFS_QUOTA +extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); +extern void xfs_trans_free_dqinfo(struct xfs_trans *); +extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, + uint, long); +extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); +extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, + struct xfs_inode *, long, long, uint); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, 
long, long, uint); + +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, + prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, + struct xfs_dquot **); +extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); +extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); +extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, + struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); +extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, + struct xfs_dquot *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *, uint); +extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern void xfs_qm_dqdetach(struct xfs_inode *); +extern void xfs_qm_dqrele(struct xfs_dquot *); +extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); +extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +extern void xfs_qm_mount_quotas(struct xfs_mount *); +extern void xfs_qm_unmount(struct xfs_mount *); +extern void xfs_qm_unmount_quotas(struct xfs_mount *); + +#else +static inline int +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, + prid_t prid, uint flags, struct xfs_dquot **udqp, + struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) +{ + *udqp = NULL; + *gdqp = NULL; + *pdqp = NULL; + return 0; +} +#define xfs_trans_dup_dqinfo(tp, tp2) +#define xfs_trans_free_dqinfo(tp) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_apply_dquot_deltas(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + long nblks, long 
nions, uint flags) +{ + return 0; +} +#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) +#define xfs_qm_vop_rename_dqattach(it) (0) +#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) +#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach_locked(ip, fl) (0) +#define xfs_qm_dqdetach(ip) +#define xfs_qm_dqrele(d) +#define xfs_qm_statvfs(ip, s) +#define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_mount_quotas(mp) +#define xfs_qm_unmount(mp) +#define xfs_qm_unmount_quotas(mp) +#endif /* CONFIG_XFS_QUOTA */ + +#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ + xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) +#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ + f | XFS_QMOPT_RES_REGBLKS) + +extern int xfs_mount_reset_sbqflags(struct xfs_mount *); + +#endif /* __XFS_QUOTA_H__ */ diff --git a/kernel/fs/xfs/xfs_quotaops.c b/kernel/fs/xfs/xfs_quotaops.c new file mode 100644 index 000000000..7795e0d01 --- /dev/null +++ b/kernel/fs/xfs/xfs_quotaops.c @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_qm.h" +#include + + +static void +xfs_qm_fill_state( + struct qc_type_state *tstate, + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_ino_t ino) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + bool tempqip = false; + + tstate->ino = ino; + if (!ip && ino == NULLFSINO) + return; + if (!ip) { + if (xfs_iget(mp, NULL, ino, 0, 0, &ip)) + return; + tempqip = true; + } + tstate->flags |= QCI_SYSFILE; + tstate->blocks = ip->i_d.di_nblocks; + tstate->nextents = ip->i_d.di_nextents; + tstate->spc_timelimit = q->qi_btimelimit; + tstate->ino_timelimit = q->qi_itimelimit; + tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_warnlimit = q->qi_bwarnlimit; + tstate->ino_warnlimit = q->qi_iwarnlimit; + tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + if (tempqip) + IRELE(ip); +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. 
+ */ +static int +xfs_fs_get_quota_state( + struct super_block *sb, + struct qc_state *state) +{ + struct xfs_mount *mp = XFS_M(sb); + struct xfs_quotainfo *q = mp->m_quotainfo; + + memset(state, 0, sizeof(*state)); + if (!XFS_IS_QUOTA_RUNNING(mp)) + return 0; + state->s_incoredqs = q->qi_dquots; + if (XFS_IS_UQUOTA_RUNNING(mp)) + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_UQUOTA_ENFORCED(mp)) + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_GQUOTA_RUNNING(mp)) + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_GQUOTA_ENFORCED(mp)) + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_PQUOTA_RUNNING(mp)) + state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_PQUOTA_ENFORCED(mp)) + state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; + + xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, + mp->m_sb.sb_uquotino); + xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, + mp->m_sb.sb_gquotino); + xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, + mp->m_sb.sb_pquotino); + return 0; +} + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) + +/* + * Adjust quota timers & warnings + */ +static int +xfs_fs_set_info( + struct super_block *sb, + int type, + struct qc_info *info) +{ + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) + return -EINVAL; + if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) + return 0; + + newlim.d_fieldmask = info->i_fieldmask; + newlim.d_spc_timer = info->i_spc_timelimit; + newlim.d_ino_timer = info->i_ino_timelimit; + 
newlim.d_rt_spc_timer = info->i_rt_spc_timelimit; + newlim.d_ino_warns = info->i_ino_warnlimit; + newlim.d_spc_warns = info->i_spc_warnlimit; + newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit; + + return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim); +} + +static unsigned int +xfs_quota_flags(unsigned int uflags) +{ + unsigned int flags = 0; + + if (uflags & FS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & FS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & FS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & FS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; + + return flags; +} + +STATIC int +xfs_quota_enable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_quota_disable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_PROJ_QUOTA) + flags |= XFS_DQ_PROJ; + + return xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int +xfs_fs_get_dqblk( + struct super_block *sb, + struct kqid qid, + struct qc_dqblk *qdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if 
(!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), qdq); +} + +STATIC int +xfs_fs_set_dqblk( + struct super_block *sb, + struct kqid qid, + struct qc_dqblk *qdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), qdq); +} + +const struct quotactl_ops xfs_quotactl_operations = { + .get_state = xfs_fs_get_quota_state, + .set_info = xfs_fs_set_info, + .quota_enable = xfs_quota_enable, + .quota_disable = xfs_quota_disable, + .rm_xquota = xfs_fs_rm_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, +}; diff --git a/kernel/fs/xfs/xfs_rtalloc.c b/kernel/fs/xfs/xfs_rtalloc.c new file mode 100644 index 000000000..f2079b691 --- /dev/null +++ b/kernel/fs/xfs/xfs_rtalloc.c @@ -0,0 +1,1302 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ *
+ * Implemented as a call to xfs_rtmodify_summary_int() with a delta
+ * argument of 0 and the caller's output pointer; its return value is
+ * passed straight back.
+ */
+static int
+xfs_rtget_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ *
+ * *stat is set to 1 as soon as a level in [low, high] has a non-zero
+ * summary value, and to 0 if none does. The function's return value is
+ * 0 in both cases; a non-zero return is the error from
+ * xfs_rtget_summary(), in which case *stat is not written.
+ */
+STATIC int				/* error */
+xfs_rtany_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		low,		/* low log2 extent size */
+	int		high,		/* high log2 extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	int		*stat)		/* out: any good extents here? */
+{
+	int		error;		/* error value */
+	int		log;		/* loop counter, log2 of ext. size */
+	xfs_suminfo_t	sum;		/* summary data */
+
+	/*
+	 * Loop over logs of extent sizes. Order is irrelevant.
+	 */
+	for (log = low; log <= high; log++) {
+		/*
+		 * Get one summary datum.
+		 */
+		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any, return success.
+		 */
+		if (sum) {
+			*stat = 1;
+			return 0;
+		}
+	}
+	/*
+	 * Found nothing, return failure.
+	 */
+	*stat = 0;
+	return 0;
+}
+
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ *
+ * Walks the summary levels from omp->m_rsumlevels - 1 down to 0 and,
+ * within each, the bitmap blocks from omp->m_sb.sb_rbmblocks - 1 down
+ * to 0. Every non-zero count found is subtracted from the old mount's
+ * summary and added at the same (log, bbno) position of the new one.
+ * Returns 0, or the first error from the summary helpers.
+ */
+STATIC int				/* error */
+xfs_rtcopy_summary(
+	xfs_mount_t	*omp,		/* old file system mount point */
+	xfs_mount_t	*nmp,		/* new file system mount point */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* summary buffer */
+	int		error;		/* error return value */
+	int		log;		/* summary level number (log length) */
+	xfs_suminfo_t	sum;		/* summary data */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	bp = NULL;
+	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+		for (bbno = omp->m_sb.sb_rbmblocks - 1;
+		     (xfs_srtblock_t)bbno >= 0;	/* signed view ends the countdown */
+		     bbno--) {
+			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+				&sumbno, &sum);
+			if (error)
+				return error;
+			if (sum == 0)
+				continue;
+			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			ASSERT(sum > 0);
+		}
+	}
+	return 0;
+}
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ *
+ * The free extent containing [start, start + len - 1] is located with
+ * xfs_rtfind_back()/xfs_rtfind_forw(); its summary count is decremented
+ * and counts are added back for whatever free space is left in front of
+ * and behind the allocated range. Returns 0 or the first error from the
+ * helpers called.
+ */
+STATIC int				/* error */
+xfs_rtallocate_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* start block to allocate */
+	xfs_extlen_t	len,		/* length to allocate */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the allocated extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock = 0;	/* first block allocated > end */
+	xfs_rtblock_t	preblock = 0;	/* first block allocated < start */
+
+	end = start + len - 1;
+	/*
+	 * Assume we're allocating out of the middle of a free extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of free extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Decrement the summary information corresponding to the entire
+	 * (old) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being allocated at the front of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being allocated at the end of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Modify the bitmap to mark this extent allocated.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 0);
+	return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno. If we don't get maxlen then use prod to trim
+ * the length, if given. Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ *
+ * On success *len and *rtblock describe the allocated extent. If no
+ * suitable extent is found the function still returns 0, with *rtblock
+ * set to NULLRTBLOCK and *nextp set to the last "next" value seen;
+ * *nextp is not written on the success paths.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_block(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_rtblock_t	*nextp,		/* out: next block to try */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_rtblock_t	besti;		/* best rtblock found so far */
+	xfs_rtblock_t	bestlen;	/* best length found so far */
+	xfs_rtblock_t	end;		/* last rtblock in chunk */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current rtblock trying */
+	xfs_rtblock_t	next;		/* next rtblock to try */
+	int		stat;		/* status from internal calls */
+
+	/*
+	 * Loop over all the extents starting in this bitmap block,
+	 * looking for one that's long enough.
+	 * besti == -1 is the "nothing remembered yet" marker tested after
+	 * the loop.
+	 */
+	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+	     i <= end;
+	     i++) {
+		/*
+		 * See if there's a free extent of maxlen starting at i.
+		 * If it's not so then next will contain the first non-free.
+		 */
+		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		if (error) {
+			return error;
+		}
+		if (stat) {
+			/*
+			 * i for maxlen is all free, allocate and return that.
+			 */
+			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+				rsb);
+			if (error) {
+				return error;
+			}
+			*len = maxlen;
+			*rtblock = i;
+			return 0;
+		}
+		/*
+		 * In the case where we have a variable-sized allocation
+		 * request, figure out how big this free piece is,
+		 * and if it's big enough for the minimum, and the best
+		 * so far, remember it.
+		 */
+		if (minlen < maxlen) {
+			xfs_rtblock_t	thislen;	/* this extent size */
+
+			thislen = next - i;
+			if (thislen >= minlen && thislen > bestlen) {
+				besti = i;
+				bestlen = thislen;
+			}
+		}
+		/*
+		 * If not done yet, find the start of the next free space.
+		 */
+		if (next < end) {
+			error = xfs_rtfind_forw(mp, tp, next, end, &i);
+			if (error) {
+				return error;
+			}
+		} else
+			break;
+	}
+	/*
+	 * Searched the whole thing & didn't find a maxlen free extent.
+	 */
+	if (minlen < maxlen && besti != -1) {
+		xfs_extlen_t	p;	/* amount to trim length by */
+
+		/*
+		 * If size should be a multiple of prod, make that so.
+		 */
+		if (prod > 1 && (p = do_mod(bestlen, prod)))
+			bestlen -= p;
+		/*
+		 * Allocate besti for bestlen & return that.
+		 */
+		error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = bestlen;
+		*rtblock = besti;
+		return 0;
+	}
+	/*
+	 * Allocation failed. Set *nextp to the next block to try.
+	 */
+	*nextp = next;
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno. If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_exact(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_extlen_t	alloclen;	/* length we settle on */
+	xfs_extlen_t	excess;		/* part of alloclen beyond a prod multiple */
+	xfs_rtblock_t	first_busy;	/* first non-free block reported */
+	int		all_free;	/* whole maxlen range is free */
+	int		error;		/* error value */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+
+	/* Is the full maxlen range starting at bno free? */
+	error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &first_busy,
+		&all_free);
+	if (error)
+		return error;
+
+	if (all_free) {
+		/* Yes: take all of it. */
+		alloclen = maxlen;
+	} else {
+		/*
+		 * No: settle for the free run in front of the first busy
+		 * block, provided it still reaches minlen after being cut
+		 * down to a multiple of prod (when prod is given).
+		 */
+		alloclen = first_busy - bno;
+		if (alloclen < minlen)
+			goto nothing;
+		if (prod > 1) {
+			excess = alloclen % prod;
+			if (excess) {
+				alloclen -= excess;
+				if (alloclen < minlen)
+					goto nothing;
+			}
+		}
+	}
+
+	/* Allocate what we decided on and report it. */
+	error = xfs_rtallocate_range(mp, tp, bno, alloclen, rbpp, rsb);
+	if (error)
+		return error;
+	*len = alloclen;
+	*rtblock = bno;
+	return 0;
+
+nothing:
+	/* Not an error: tell the caller that nothing was allocated. */
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible. 
If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ *
+ * The exact position is tried first; after that, bitmap blocks are
+ * visited at offsets 0, 1, -1, 2, -2, ... from the block holding bno
+ * for as long as blocks remain on each side. On success *len and
+ * *rtblock describe the extent. If nothing is found the function
+ * returns 0 with *rtblock set to NULLRTBLOCK.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_near(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		any;		/* any useful extents from summary */
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	int		error;		/* error value */
+	int		i;		/* bitmap block offset (loop control) */
+	int		j;		/* secondary loop control */
+	int		log2len;	/* log2 of minlen */
+	xfs_rtblock_t	n;		/* next block to try */
+	xfs_rtblock_t	r;		/* result block */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * If the block number given is off the end, silently set it to
+	 * the last block.
+	 */
+	if (bno >= mp->m_sb.sb_rextents)
+		bno = mp->m_sb.sb_rextents - 1;
+	/*
+	 * Try the exact allocation first.
+	 */
+	error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+		rbpp, rsb, prod, &r);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If the exact allocation worked, return that.
+	 */
+	if (r != NULLRTBLOCK) {
+		*rtblock = r;
+		return 0;
+	}
+	bbno = XFS_BITTOBLOCK(mp, bno);
+	i = 0;
+	ASSERT(minlen != 0);
+	log2len = xfs_highbit32(minlen);
+	/*
+	 * Loop over all bitmap blocks (bbno + i is current block).
+	 */
+	for (;;) {
+		/*
+		 * Get summary information of extents of all useful levels
+		 * starting in this bitmap block.
+		 */
+		error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+			bbno + i, rbpp, rsb, &any);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any useful extents starting here, try
+		 * allocating one.
+		 */
+		if (any) {
+			/*
+			 * On the positive side of the starting location.
+			 */
+			if (i >= 0) {
+				/*
+				 * Try to allocate an extent starting in
+				 * this block.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it worked, return it.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+			/*
+			 * On the negative side of the starting location.
+			 */
+			else {		/* i < 0 */
+				/*
+				 * Loop backwards through the bitmap blocks from
+				 * the starting point-1 up to where we are now.
+				 * There should be an extent which ends in this
+				 * bitmap block and is long enough.
+				 */
+				for (j = -1; j > i; j--) {
+					/*
+					 * Grab the summary information for
+					 * this bitmap block.
+					 */
+					error = xfs_rtany_summary(mp, tp,
+						log2len, mp->m_rsumlevels - 1,
+						bbno + j, rbpp, rsb, &any);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If there's no extent given in the
+					 * summary that means the extent we
+					 * found must carry over from an
+					 * earlier block. If there is an
+					 * extent given, we've already tried
+					 * that allocation, don't do it again.
+					 */
+					if (any)
+						continue;
+					error = xfs_rtallocate_extent_block(mp,
+						tp, bbno + j, minlen, maxlen,
+						len, &n, rbpp, rsb, prod, &r);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If it works, return the extent.
+					 */
+					if (r != NULLRTBLOCK) {
+						*rtblock = r;
+						return 0;
+					}
+				}
+				/*
+				 * There weren't intervening bitmap blocks
+				 * with a long enough extent, or the
+				 * allocation didn't work for some reason
+				 * (i.e. it's a little too short).
+				 * Try to allocate from the summary block
+				 * that we found.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it works, return the extent.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+		}
+		/*
+		 * Loop control. If we were on the positive side, and there's
+		 * still more blocks on the negative side, go there.
+		 */
+		if (i > 0 && (int)bbno - i >= 0)
+			i = -i;
+		/*
+		 * If positive, and no more negative, but there are more
+		 * positive, go there.
+		 */
+		else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+			i++;
+		/*
+		 * If negative or 0 (just started), and there are positive
+		 * blocks to go, go there. The 0 case moves to block 1.
+		 */
+		else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+			i = 1 - i;
+		/*
+		 * If negative or 0 and there are more negative blocks,
+		 * go there.
+		 */
+		else if (i <= 0 && (int)bbno + i > 0)
+			i--;
+		/*
+		 * Must be done. Return failure.
+		 */
+		else
+			break;
+	}
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified. If we don't get maxlen then use prod to trim
+ * the length, if given. The lengths are all in rtextents.
+ */ +STATIC int /* error */ +xfs_rtallocate_extent_size( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + int error; /* error value */ + int i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtblock_t n; /* next block to be tried */ + xfs_rtblock_t r; /* result block number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + + /* + * Loop over all the levels starting with maxlen. + * At each level, look at all the bitmap blocks, to see if there + * are extents starting there that are long enough (>= maxlen). + * Note, only on the initial level can the allocation fail if + * the summary says there's an extent. + */ + for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { + /* + * Loop over all the bitmap blocks. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, + maxlen, len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. 
+ */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Didn't find any maxlen blocks. Try smaller ones, unless + * we're asking for a fixed size extent. + */ + if (minlen > --maxlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + + /* + * Loop over sizes, from maxlen down to minlen. + * This time, when we do the allocations, allow smaller ones + * to succeed. + */ + for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { + /* + * Loop over all the bitmap blocks, try an allocation + * starting in that block. + */ + for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { + /* + * Get the summary information for this level/block. + */ + error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, + &sum); + if (error) { + return error; + } + /* + * If nothing there, go on to next. + */ + if (!sum) + continue; + /* + * Try the allocation. Make sure the specified + * minlen/maxlen are in the possible range for + * this summary level. + */ + error = xfs_rtallocate_extent_block(mp, tp, i, + XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), + len, &n, rbpp, rsb, prod, &r); + if (error) { + return error; + } + /* + * If it worked, return that extent. + */ + if (r != NULLRTBLOCK) { + *rtblock = r; + return 0; + } + /* + * If the "next block to try" returned from the + * allocator is beyond the next bitmap block, + * skip to that bitmap block. + */ + if (XFS_BITTOBLOCK(mp, n) > i + 1) + i = XFS_BITTOBLOCK(mp, n) - 1; + } + } + /* + * Got nothing, return failure. + */ + *rtblock = NULLRTBLOCK; + return 0; +} + +/* + * Allocate space to the bitmap or summary file, and zero it, for growfs. 
+ */ +STATIC int /* error */ +xfs_growfs_rt_alloc( + xfs_mount_t *mp, /* file system mount point */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ +{ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ + + /* + * Allocate space to the file, as necessary. + */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; + + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); + /* + * Reserve space & log for one extent added to the file. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, + resblks, 0); + if (error) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + /* + * Lock the inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&flist, &firstblock); + /* + * Allocate blocks to the bitmap file. + */ + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = -ENOSPC; + if (error) + goto error_cancel; + /* + * Free any blocks freed up in the transaction, then commit. + */ + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; + /* + * Now we need to clear the allocated blocks. 
+ * Do this one block per transaction, to keep it simple. + */ + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); + /* + * Reserve log for one block zeroing. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); + if (error) + goto error_cancel; + /* + * Lock the bitmap inode. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + /* + * Get a buffer for the block. + */ + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = -EIO; +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; + } + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); + /* + * Commit the transaction. + */ + error = xfs_trans_commit(tp, 0); + if (error) + goto error; + } + /* + * Go on to the next extent, if any. + */ + oblocks = map.br_startoff + map.br_blockcount; + } + return 0; + +error: + return error; +} + +/* + * Visible (exported) functions. + */ + +/* + * Grow the realtime area of the filesystem. 
+ */ +int +xfs_growfs_rt( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_growfs_rt_t *in) /* growfs rt input struct */ +{ + xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_buf_t *bp; /* temporary buffer */ + int error; /* error return value */ + xfs_mount_t *nmp; /* new (fake) mount structure */ + xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ + xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ + xfs_rtblock_t nrextents; /* new number of realtime extents */ + uint8_t nrextslog; /* new log2 of sb_rextents */ + xfs_extlen_t nrsumblocks; /* new number of summary blocks */ + uint nrsumlevels; /* new rt summary levels */ + uint nrsumsize; /* new size of rt summary, bytes */ + xfs_sb_t *nsbp; /* new superblock */ + xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ + xfs_extlen_t rsumblocks; /* current number of rt summary blks */ + xfs_sb_t *sbp; /* old superblock */ + xfs_fsblock_t sumbno; /* summary block number */ + + sbp = &mp->m_sb; + /* + * Initial error checking. + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || + (nrblocks = in->newblocks) <= sbp->sb_rblocks || + (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + return -EINVAL; + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; + /* + * Read in the last block of the device, make sure it exists. + */ + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) + return error; + xfs_buf_relse(bp); + + /* + * Calculate new parameters. These are the final values to be reached. 
+ */ + nrextents = nrblocks; + do_div(nrextents, in->extsize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrextslog = xfs_highbit32(nrextents); + nrsumlevels = nrextslog + 1; + nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * New summary size can't be more than half the size of + * the log. This prevents us from getting a log overflow, + * since we'll log basically the whole summary file at once. + */ + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) + return -EINVAL; + /* + * Get the old block counts for bitmap and summary inodes. + * These can't change since other growfs callers are locked out. + */ + rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size); + rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size); + /* + * Allocate space to the bitmap and summary files, as necessary. + */ + error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + if (error) + return error; + error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + if (error) + return error; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + /* + * Loop over the bitmap blocks. + * We will do everything one bitmap block at a time. + * Skip the current block if it is exactly full. + * This also deals with the case where there were no rtextents before. + */ + for (bmbno = sbp->sb_rbmblocks - + ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); + bmbno < nrbmblocks; + bmbno++) { + xfs_trans_t *tp; + int cancelflags = 0; + + *nmp = *mp; + nsbp = &nmp->m_sb; + /* + * Calculate new sb and mount fields for this round. 
+ */ + nsbp->sb_rextsize = in->extsize; + nsbp->sb_rbmblocks = bmbno + 1; + nsbp->sb_rblocks = + XFS_RTMIN(nrblocks, + nsbp->sb_rbmblocks * NBBY * + nsbp->sb_blocksize * nsbp->sb_rextsize); + nsbp->sb_rextents = nsbp->sb_rblocks; + do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); + nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; + nrsumsize = + (uint)sizeof(xfs_suminfo_t) * nrsumlevels * + nsbp->sb_rbmblocks; + nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* + * Start a transaction, get the log reservation. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, + 0, 0); + if (error) + goto error_cancel; + /* + * Lock out other callers by grabbing the bitmap inode lock. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + /* + * Update the bitmap inode's size. + */ + mp->m_rbmip->i_d.di_size = + nsbp->sb_rbmblocks * nsbp->sb_blocksize; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + cancelflags |= XFS_TRANS_ABORT; + /* + * Get the summary inode into the transaction. + */ + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); + /* + * Update the summary inode's size. + */ + mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); + /* + * Copy summary data from old to new sizes. + * Do this when the real size (not block-aligned) changes. + */ + if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || + mp->m_rsumlevels != nmp->m_rsumlevels) { + error = xfs_rtcopy_summary(mp, nmp, tp); + if (error) + goto error_cancel; + } + /* + * Update superblock fields. 
+ */ + if (nsbp->sb_rextsize != sbp->sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nsbp->sb_rextsize - sbp->sb_rextsize); + if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nsbp->sb_rbmblocks - sbp->sb_rbmblocks); + if (nsbp->sb_rblocks != sbp->sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nsbp->sb_rblocks - sbp->sb_rblocks); + if (nsbp->sb_rextents != sbp->sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + if (nsbp->sb_rextslog != sbp->sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nsbp->sb_rextslog - sbp->sb_rextslog); + /* + * Free new extent. + */ + bp = NULL; + error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); + break; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, + nsbp->sb_rextents - sbp->sb_rextents); + /* + * Update mp values into the real mp structure. + */ + mp->m_rsumlevels = nrsumlevels; + mp->m_rsumsize = nrsumsize; + + error = xfs_trans_commit(tp, 0); + if (error) + break; + } + + /* + * Free the fake mp structure. + */ + kmem_free(nmp); + + return error; +} + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... 
*/ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock) /* out: start block allocated */ +{ + xfs_mount_t *mp = tp->t_mountp; + int error; /* error value */ + xfs_rtblock_t r; /* result allocated block */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp; /* summary file block buffer */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + ASSERT(minlen > 0 && minlen <= maxlen); + + /* + * If prod is set then figure out what to do to minlen and maxlen. + */ + if (prod > 1) { + xfs_extlen_t i; + + if ((i = maxlen % prod)) + maxlen -= i; + if ((i = minlen % prod)) + minlen += prod - i; + if (maxlen < minlen) { + *rtblock = NULLRTBLOCK; + return 0; + } + } + + sumbp = NULL; + /* + * Allocate by size, or near another block, or exactly at some block. + */ + switch (type) { + case XFS_ALLOCTYPE_ANY_AG: + error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, + &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_NEAR_BNO: + error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + case XFS_ALLOCTYPE_THIS_BNO: + error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, + len, &sumbp, &sb, prod, &r); + break; + default: + error = -EIO; + ASSERT(0); + } + if (error) + return error; + + /* + * If it worked, update the superblock. + */ + if (r != NULLRTBLOCK) { + long slen = (long)*len; + + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); + } + *rtblock = r; + return 0; +} + +/* + * Initialize realtime fields in the mount structure. 
+ */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp) /* file system mount structure */ +{ + struct xfs_buf *bp; /* buffer for last block of subvolume */ + struct xfs_sb *sbp; /* filesystem superblock copy in mount */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; + + sbp = &mp->m_sb; + if (sbp->sb_rblocks == 0) + return 0; + if (mp->m_rtdev_targp == NULL) { + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); + return -ENODEV; + } + mp->m_rsumlevels = sbp->sb_rextslog + 1; + mp->m_rsumsize = + (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * + sbp->sb_rbmblocks; + mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + mp->m_rbmip = mp->m_rsumip = NULL; + /* + * Check that the realtime section is an ok size. + */ + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { + xfs_warn(mp, "realtime mount -- %llu != %llu", + (unsigned long long) XFS_BB_TO_FSB(mp, d), + (unsigned long long) mp->m_sb.sb_rblocks); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "realtime device size check failed"); + return error; + } + xfs_buf_relse(bp); + return 0; +} + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. 
+ */ +int /* error */ +xfs_rtmount_inodes( + xfs_mount_t *mp) /* file system mount structure */ +{ + int error; /* error return value */ + xfs_sb_t *sbp; + + sbp = &mp->m_sb; + if (sbp->sb_rbmino == NULLFSINO) + return 0; + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (error) + return error; + ASSERT(mp->m_rbmip != NULL); + ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (error) { + IRELE(mp->m_rbmip); + return error; + } + ASSERT(mp->m_rsumip != NULL); + return 0; +} + +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + +/* + * Pick an extent for allocation at the start of a new realtime file. + * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... 
+ */ +int /* error */ +xfs_rtpick_extent( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ +{ + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + __uint64_t resid; /* residual after log removed */ + __uint64_t seq; /* sequence number of file creation */ + __uint64_t *seqp; /* pointer to seqno in inode */ + + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *seqp = 0; + } + seq = *seqp; + if ((log2 = xfs_highbit64(seq)) == -1) + b = 0; + else { + resid = seq - (1ULL << log2); + b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >> + (log2 + 1); + if (b >= mp->m_sb.sb_rextents) + b = do_mod(b, mp->m_sb.sb_rextents); + if (b + len > mp->m_sb.sb_rextents) + b = mp->m_sb.sb_rextents - len; + } + *seqp = seq + 1; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + *pick = b; + return 0; +} diff --git a/kernel/fs/xfs/xfs_rtalloc.h b/kernel/fs/xfs/xfs_rtalloc.h new file mode 100644 index 000000000..76c0a4a9b --- /dev/null +++ b/kernel/fs/xfs/xfs_rtalloc.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_RTALLOC_H__ +#define __XFS_RTALLOC_H__ + +/* kernel only definitions and functions */ + +struct xfs_mount; +struct xfs_trans; + +#ifdef CONFIG_XFS_RT +/* + * Function prototypes for exported functions. + */ + +/* + * Allocate an extent in the realtime subvolume, with the usual allocation + * parameters. The length units are all in realtime extents, as is the + * result block number. + */ +int /* error */ +xfs_rtallocate_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to allocate */ + xfs_extlen_t minlen, /* minimum length to allocate */ + xfs_extlen_t maxlen, /* maximum length to allocate */ + xfs_extlen_t *len, /* out: actual length allocated */ + xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */ + int wasdel, /* was a delayed allocation extent */ + xfs_extlen_t prod, /* extent product factor */ + xfs_rtblock_t *rtblock); /* out: start block allocated */ + +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len); /* length of extent freed */ + +/* + * Initialize realtime fields in the mount structure. + */ +int /* error */ +xfs_rtmount_init( + struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); + +/* + * Get the bitmap and summary inodes into the mount structure + * at mount time. + */ +int /* error */ +xfs_rtmount_inodes( + struct xfs_mount *mp); /* file system mount structure */ + +/* + * Pick an extent for allocation at the start of a new realtime file. 
+ * Use the sequence number stored in the atime field of the bitmap inode. + * Translate this to a fraction of the rtextents, and return the product + * of rtextents and the fraction. + * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... + */ +int /* error */ +xfs_rtpick_extent( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick); /* result rt extent */ + +/* + * Grow the realtime area of the filesystem. + */ +int +xfs_growfs_rt( + struct xfs_mount *mp, /* file system mount structure */ + xfs_growfs_rt_t *in); /* user supplied growfs struct */ + +/* + * From xfs_rtbitmap.c + */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); +int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val, + xfs_rtblock_t *new, int *stat); +int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, + int log, xfs_rtblock_t bbno, int delta, + xfs_buf_t **rbpp, xfs_fsblock_t *rsb, + xfs_suminfo_t *sum); +int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, + xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, + xfs_fsblock_t *rsb); +int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + struct xfs_buf **rbpp, xfs_fsblock_t *rsb); + + +#else +# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) +# define xfs_rtfree_extent(t,b,l) 
(ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) +# define xfs_growfs_rt(mp,in) (ENOSYS) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + xfs_warn(mp, "Not built with CONFIG_XFS_RT"); + return -ENOSYS; +} +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) +#endif /* CONFIG_XFS_RT */ + +#endif /* __XFS_RTALLOC_H__ */ diff --git a/kernel/fs/xfs/xfs_stats.c b/kernel/fs/xfs/xfs_stats.c new file mode 100644 index 000000000..f2240383d --- /dev/null +++ b/kernel/fs/xfs/xfs_stats.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +static int counter_val(int idx) +{ + int val = 0, cpu; + + for_each_possible_cpu(cpu) + val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + return val; +} + +static int xfs_stat_proc_show(struct seq_file *m, void *v) +{ + int i, j; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, + }; + + /* Loop over all stats groups */ + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); + /* inner loop does each group */ + for (; j < xstats[i].endpoint; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += 
per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + seq_printf(m, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + seq_printf(m, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + return 0; +} + +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); +} + +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota interfaces */ +#ifdef CONFIG_XFS_QUOTA +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + 0, + counter_val(XFSSTAT_END_XQMSTAT), + 0, + counter_val(XFSSTAT_END_XQMSTAT + 1)); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota stats interface no 2 */ +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + int j; + + seq_printf(m, "qm"); + for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XFS_QUOTA */ + +int +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + goto out; + + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) + 
goto out_remove_xfs_dir; +#ifdef CONFIG_XFS_QUOTA + if (!proc_create("fs/xfs/xqmstat", 0, NULL, + &xqmstat_proc_fops)) + goto out_remove_stat_file; + if (!proc_create("fs/xfs/xqm", 0, NULL, + &xqm_proc_fops)) + goto out_remove_xqmstat_file; +#endif + return 0; + +#ifdef CONFIG_XFS_QUOTA + out_remove_xqmstat_file: + remove_proc_entry("fs/xfs/xqmstat", NULL); + out_remove_stat_file: + remove_proc_entry("fs/xfs/stat", NULL); +#endif + out_remove_xfs_dir: + remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; +} + +void +xfs_cleanup_procfs(void) +{ +#ifdef CONFIG_XFS_QUOTA + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +#endif + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/kernel/fs/xfs/xfs_stats.h b/kernel/fs/xfs/xfs_stats.h new file mode 100644 index 000000000..c8f238b82 --- /dev/null +++ b/kernel/fs/xfs/xfs_stats.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2000,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + 
__uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + __uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t 
xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + 
__uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; +#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) + __uint32_t xs_qm_dquot; + __uint32_t xs_qm_dquot_unused; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). + */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern int xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/kernel/fs/xfs/xfs_super.c b/kernel/fs/xfs/xfs_super.c new file mode 100644 index 000000000..858e1e62b --- /dev/null +++ b/kernel/fs/xfs/xfs_super.c @@ -0,0 +1,1889 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" +#include "xfs_icache.h" +#include "xfs_trace.h" +#include "xfs_icreate_item.h" +#include "xfs_filestream.h" +#include "xfs_quota.h" +#include "xfs_sysfs.h" + +#include <linux/namei.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/mempool.h> +#include <linux/writeback.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/parser.h> + +static const struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_ioend_zone; +mempool_t *xfs_ioend_pool; + +static struct kset *xfs_kset; /* top-level xfs sysfs dir */ +#ifdef DEBUG +static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ +#endif + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O
device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). 
*/ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. 
+ */ +enum { + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_kstrtoint(char *s, unsigned int base, int *res) +{ + int last, shift_left_factor = 0, _res; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. + */ +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options) +{ + struct super_block *sb = mp->m_super; + char *this_char, *value; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; + __uint8_t iosizelog = 0; + + /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return -ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. 
+ */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + + if (!options) + goto done; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &mp->m_logbufs)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return -ENOMEM; + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return -ENOMEM; + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &iosize)) + return -EINVAL; + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (suffix_kstrtoint(value, 10, &iosize)) + return -EINVAL; + iosizelog = ffs(iosize) - 1; + } 
else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + mp->m_flags |= XFS_MOUNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + mp->m_flags |= XFS_MOUNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + mp->m_flags |= XFS_MOUNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &dsunit)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return -EINVAL; + } + if (kstrtoint(value, 10, &dswidth)) + return -EINVAL; + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + mp->m_flags |= XFS_MOUNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + mp->m_flags |= XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + mp->m_flags |= XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + mp->m_flags &= ~XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + mp->m_flags &= 
~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_PQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_PQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_GQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_GQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } else { + xfs_warn(mp, "unknown mount option [%s].", this_char); + return -EINVAL; + } + } + + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + 
return -EINVAL; + } + +#ifndef CONFIG_XFS_QUOTA + if (XFS_IS_QUOTA_RUNNING(mp)) { + xfs_warn(mp, "quota support not available in this kernel."); + return -EINVAL; + } +#endif + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return -EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return -EINVAL; + } + +done: + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return -EINVAL; + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return -EINVAL; + } + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return -EINVAL; + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; + } + + return 0; +}
+
+/*
+ * One row of the option tables used by xfs_showargs(): a mount flag bit
+ * and the ",option" text to print for it. A zero flag ends a table.
+ */
+struct proc_xfs_info {
+	int	flag;
+	char	*str;
+};
+
+/*
+ * Write the mount options currently in effect for @mp to the seq_file @m,
+ * each one prefixed with a comma.
+ *
+ * Options are derived from mp->m_flags (via two tables: flags printed
+ * when set, and flags printed when clear), the log and I/O size fields,
+ * the log/realtime device names, the stripe geometry and mp->m_qflags.
+ *
+ * Always returns 0.
+ */
+STATIC int
+xfs_showargs(
+	struct xfs_mount	*mp,
+	struct seq_file		*m)
+{
+	/* Options shown when the corresponding flag is set. */
+	static const struct proc_xfs_info xfs_info_set[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_IKEEP,		"," MNTOPT_IKEEP },
+		{ XFS_MOUNT_WSYNC,		"," MNTOPT_WSYNC },
+		{ XFS_MOUNT_NOALIGN,		"," MNTOPT_NOALIGN },
+		{ XFS_MOUNT_SWALLOC,		"," MNTOPT_SWALLOC },
+		{ XFS_MOUNT_NOUUID,		"," MNTOPT_NOUUID },
+		{ XFS_MOUNT_NORECOVERY,		"," MNTOPT_NORECOVERY },
+		{ XFS_MOUNT_ATTR2,		"," MNTOPT_ATTR2 },
+		{ XFS_MOUNT_FILESTREAMS,	"," MNTOPT_FILESTREAM },
+		{ XFS_MOUNT_GRPID,		"," MNTOPT_GRPID },
+		{ XFS_MOUNT_DISCARD,		"," MNTOPT_DISCARD },
+		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_32BITINODE },
+		{ 0, NULL }
+	};
+	/* Options shown when the corresponding flag is clear. */
+	static const struct proc_xfs_info xfs_info_unset[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_COMPAT_IOSIZE,	"," MNTOPT_LARGEIO },
+		{ XFS_MOUNT_BARRIER,		"," MNTOPT_NOBARRIER },
+		{ XFS_MOUNT_SMALL_INUMS,	"," MNTOPT_64BITINODE },
+		{ 0, NULL }
+	};
+	const struct proc_xfs_info	*xfs_infop;
+
+	for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
+		if (mp->m_flags & xfs_infop->flag)
+			seq_puts(m, xfs_infop->str);
+	}
+	for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
+		if (!(mp->m_flags & xfs_infop->flag))
+			seq_puts(m, xfs_infop->str);
+	}
+
+	/* m_writeio_log is a log2 byte count; report it in KiB. */
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+				(int)(1 << mp->m_writeio_log) >> 10);
+
+	if (mp->m_logbufs > 0)
+		seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+	if (mp->m_logbsize > 0)
+		seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+
+	if (mp->m_logname)
+		seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+	if (mp->m_rtname)
+		seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+
+	if (mp->m_dalign > 0)
+		seq_printf(m, "," MNTOPT_SUNIT "=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+	if (mp->m_swidth > 0)
+		seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+	/*
+	 * For each quota type, accounting decides whether anything is
+	 * shown and enforcement picks between the enforcing and the
+	 * "noenforce" spelling. Testing ACCT|ENFD with a single '&' would
+	 * be true whenever accounting is on and would report uqnoenforce
+	 * mounts as usrquota.
+	 */
+	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_UQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_USRQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_UQUOTANOENF);
+	}
+	if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_PQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_PRJQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_PQUOTANOENF);
+	}
+	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		if (mp->m_qflags & XFS_GQUOTA_ENFD)
+			seq_puts(m, "," MNTOPT_GRPQUOTA);
+		else
+			seq_puts(m, "," MNTOPT_GQUOTANOENF);
+	}
+
+	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
+		seq_puts(m, "," MNTOPT_NOQUOTA);
+
+	return 0;
+}
+__uint64_t
+xfs_max_file_offset(
+	unsigned int		blockshift)
+{
+	unsigned int		pagefactor = 1;
+	unsigned int		bitshift = BITS_PER_LONG - 1;
+
+	/* Figure out maximum filesize, on Linux this can depend on
+	 * the filesystem blocksize (on 32 bit platforms).
+	 * __block_write_begin does this in an [unsigned] long...
+	 *      page->index << (PAGE_CACHE_SHIFT - bbits)
+	 * So, for page sized blocks (4K on 32 bit platforms),
+	 * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+	 *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+	 * but for smaller blocksizes it is less (bbits = log2 bsize).
+	 * Note1: get_block_t takes a long (implicit cast from above)
+	 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+	 * can optionally convert the [unsigned] long from above into
+	 * an [unsigned] long long.
+	 */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBDAF)
+	ASSERT(sizeof(sector_t) == 8);
+	pagefactor = PAGE_CACHE_SIZE;
+	bitshift = BITS_PER_LONG;
+# else
+	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+	return (((__uint64_t)pagefactor) << bitshift) - 1;
+}
+
+/* + * xfs_set_inode32() and xfs_set_inode64() are passed an agcount + * because in the growfs case, mp->m_sb.sb_agcount is not updated + * yet to the potentially higher ag count. + */ +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino; + xfs_ino_t ino; + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage.
+ */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = agcount; + } + + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + + for (index = 0; index < agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + + if (ino > XFS_MAXINUMBER_32) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + +STATIC int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + xfs_warn(mp, "Invalid device [%s], error=%d", name, error); + } + + return error; +} + +STATIC void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->bt_bdev, 
GFP_NOFS, NULL); +} + +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp, mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. 
+ */ + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + if (error) + goto out; + } + + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = -EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = -ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp, mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp, mp->m_ddev_targp); + out_close_rtdev: + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + +STATIC int +xfs_init_mount_workqueues( + struct xfs_mount *mp) +{ + 
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + if (!mp->m_buf_workqueue) + goto out; + + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_data_workqueue) + goto out_destroy_buf; + + mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_unwritten_workqueue) + goto out_destroy_data_iodone_queue; + + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + WQ_FREEZABLE, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + + return 0; + +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); +out_destroy_data_iodone_queue: + destroy_workqueue(mp->m_data_workqueue); +out_destroy_buf: + destroy_workqueue(mp->m_buf_workqueue); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_mount_workqueues( + struct xfs_mount *mp) +{ + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); + destroy_workqueue(mp->m_cil_workqueue); + destroy_workqueue(mp->m_data_workqueue); + destroy_workqueue(mp->m_unwritten_workqueue); + destroy_workqueue(mp->m_buf_workqueue); +} + +/* + * Flush all dirty data to disk. 
Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ *
+ * The flush is best-effort: it only runs if sb->s_umount can be taken for
+ * read without blocking; otherwise the function returns without syncing.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+
+	/* Skip the sync rather than wait if s_umount is unavailable. */
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
+}
+
+/*
+ * Catch misguided souls that try to use this interface on XFS.
+ *
+ * Hits BUG() unconditionally; @sb is never examined. The NULL return
+ * after it is presumably only there to satisfy the return type.
+ */
+STATIC struct inode *
+xfs_fs_alloc_inode(
+	struct super_block	*sb)
+{
+	BUG();
+	return NULL;
+}
+
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ *
+ * @inode is converted to its XFS inode with XFS_I(). This function emits
+ * a trace event, bumps the vn_reclaim statistic, checks its assertions
+ * and tags the inode via xfs_inode_set_reclaim_tag(); it does not free
+ * anything itself.
+ */
+STATIC void
+xfs_fs_destroy_inode(
+	struct inode		*inode)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	trace_xfs_destroy_inode(ip);
+
+	XFS_STATS_INC(vn_reclaim);
+
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+	/*
+	 * We should never get here with one of the reclaim flags already set.
+	 */
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+	/*
+	 * We always use background reclaim here because even if the
+	 * inode is clean, it still may be under IO and hence we have
+	 * to take the flush lock. The background reclaim path handles
+	 * this more efficiently than we can here, so simply let background
+	 * reclaim tear down all inodes.
+	 */
+	xfs_inode_set_reclaim_tag(ip);
+}
+
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that are left in the initialised state
+ * when freeing the inode.
+ */ +STATIC void +xfs_fs_inode_init_once( + void *inode) +{ + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); +} + +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + xfs_inode_t *ip = XFS_I(inode); + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + + trace_xfs_evict_inode(ip); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + + xfs_inactive(ip); +} + +/* + * We do an unlocked check for XFS_IDONTCACHE here because we are already + * serialised against cache hits here via the inode->i_lock and igrab() in + * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be + * racing with us, and it avoids needing to grab a spinlock here for every inode + * we drop the final reference on. + */ +STATIC int +xfs_fs_drop_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); +} + +STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC int +xfs_fs_sync_fs( + struct super_block *sb, + int wait) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Doing anything during the async pass would be counterproductive. + */ + if (!wait) + return 0; + + xfs_log_force(mp, XFS_LOG_SYNC); + if (laptop_mode) { + /* + * The disk must be active because we're syncing. + * We schedule log work now (now that the disk is + * active) instead of later (when it might not be). 
+ */ + flush_delayed_work(&mp->m_log->l_work); + } + + return 0; +} + +/* + * Fill in @statp for the filesystem that owns @dentry. + * + * The inode and free-block figures come from percpu_counter_sum() on the + * mount's counters; the block size and total block count are read from the + * in-core superblock under m_sb_lock, with sb_logblocks subtracted when + * sb_logstart is non-zero. f_files is an estimate: allocated inodes plus + * the inodes that could be created from the free blocks, capped by + * XFS_MAXINUMBER and m_maxicount and never reported below sb_icount. + * If the inode has XFS_DIFLAG_PROJINHERIT set and project quota is both + * accounted and enforced, xfs_qm_statvfs() is called to adjust the result. + * + * Always returns 0. + */ +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *statp) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; + xfs_extlen_t lsize; + __int64_t ffree; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + spin_unlock(&mp->m_sb_lock); + + /* + * fdblocks is unsigned, so a counter value below the set-aside amount + * would wrap to a huge number here. Clamp at zero so that f_bfree, + * f_bavail and the fakeinos estimate below stay sane. + */ + statp->f_bfree = max_t(__int64_t, fdblocks - XFS_ALLOC_SET_ASIDE(mp), 0); + statp->f_bavail = statp->f_bfree; + + fakeinos = statp->f_bfree << sbp->sb_inopblog; + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + + /* If sb_icount overshot maxicount, report actual allocation */ + statp->f_files = max_t(typeof(statp->f_files), + statp->f_files, + sbp->sb_icount); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (icount - ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + + + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; +} + +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ +
__uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. + * + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. + */ +static void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. 
+ */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + xfs_log_quiesce(mp); +} + +STATIC int +xfs_fs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + struct xfs_mount *mp = XFS_M(sb); + xfs_sb_t *sbp = &mp->m_sb; + substring_t args[MAX_OPT_ARGS]; + char *p; + int error; + + sync_filesystem(sb); + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + case Opt_inode64: + mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount); + break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount); + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + xfs_info(mp, + "mount option \"%s\" not supported for remount", p); + return -EINVAL; +#else + break; +#endif + } + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. 
+ */ + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_sb = false; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. + */ + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; +} + +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of the metadata. Once that's done sync the superblock + * to the log to dirty it in case of a crash while frozen. This ensures that we + * will recover the unlinked inode lists on the next mount. + */ +STATIC int +xfs_fs_freeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + return xfs_sync_sb(mp, true); +} + +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + return 0; +} + +STATIC int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) +{ + return xfs_showargs(XFS_M(root->d_sb), m); +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. 
+ */ +STATIC int +xfs_finish_flags( + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller than the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); + return -EINVAL; + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); + return -EINVAL; + } + } + + /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return -EINVAL; + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); + return -EROFS; + } + + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + !xfs_sb_version_has_pquotino(&mp->m_sb)) { + xfs_warn(mp, + "Super block does not support project and group quota together"); + return -EINVAL; + } + + return 0; +} + +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, 
GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = -ENOMEM; + + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; + + mp->m_super = sb; + sb->s_fs_info = mp; + + error = xfs_parseargs(mp, (char *)data); + if (error) + goto out_free_fsname; + + sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; + sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA + sb->s_qcop = &xfs_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &xfs_super_operations; + + if (silent) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp); + if (error) + goto out_free_fsname; + + error = xfs_init_mount_workqueues(mp); + if (error) + goto out_close_devices; + + error = xfs_init_percpu_counters(mp); + if (error) + goto out_destroy_workqueues; + + error = xfs_readsb(mp, flags); + if 
(error) + goto out_destroy_counters; + + error = xfs_finish_flags(mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. + */ + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_max_links = XFS_MAXLINK; + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = -ENOENT; + goto out_unmount; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + error = -ENOMEM; + goto out_unmount; + } + + return 0; + + out_filestream_unmount: + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_destroy_counters: + xfs_destroy_percpu_counters(mp); +out_destroy_workqueues: + xfs_destroy_mount_workqueues(mp); + out_close_devices: + xfs_close_devices(mp); + out_free_fsname: + xfs_free_fsname(mp); + kfree(mp); + out: + return error; + + out_unmount: + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + goto out_free_sb; +} + +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + +STATIC struct dentry * +xfs_fs_mount( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void 
*data) +{ + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); +} + +static long +xfs_fs_nr_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + return xfs_reclaim_inodes_count(XFS_M(sb)); +} + +static long +xfs_fs_free_cached_objects( + struct super_block *sb, + struct shrink_control *sc) +{ + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .evict_inode = xfs_fs_evict_inode, + .drop_inode = xfs_fs_drop_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .remount_fs = xfs_fs_remount, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, +}; + +static struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .mount = xfs_fs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("xfs"); + +STATIC int __init +xfs_init_zones(void) +{ + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if 
(!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_da_state_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_log_item_desc_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; + + return 0; + + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + 
kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + +} + +STATIC int __init +xfs_init_workqueues(void) +{ + /* + * The allocation workqueue can be used in memory reclaim situations + * (writepage path), and parallelism is only limited by the number of + * AGs in all the filesystems mounted. Hence use the default large + * max_active value for this workqueue. 
+ */ + xfs_alloc_wq = alloc_workqueue("xfsalloc", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); + if (!xfs_alloc_wq) + return -ENOMEM; + + return 0; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_alloc_wq); +} + +STATIC int __init +init_xfs_fs(void) +{ + int error; + + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); + + xfs_dir_startup(); + + error = xfs_init_zones(); + if (error) + goto out; + + error = xfs_init_workqueues(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + + error = xfs_buf_init(); + if (error) + goto out_mru_cache_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; + + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); + if (!xfs_kset) { + error = -ENOMEM; + goto out_sysctl_unregister;; + } + +#ifdef DEBUG + xfs_dbg_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); + if (error) + goto out_kset_unregister; +#endif + + error = xfs_qm_init(); + if (error) + goto out_remove_kobj; + + error = register_filesystem(&xfs_fs_type); + if (error) + goto out_qm_exit; + return 0; + + out_qm_exit: + xfs_qm_exit(); + out_remove_kobj: +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); + out_kset_unregister: +#endif + kset_unregister(xfs_kset); + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: + xfs_buf_terminate(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); + out_destroy_zones: + xfs_destroy_zones(); + out: + return error; +} + +STATIC void __exit +exit_xfs_fs(void) +{ + xfs_qm_exit(); + unregister_filesystem(&xfs_fs_type); +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); +#endif + kset_unregister(xfs_kset); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); + xfs_buf_terminate(); + 
xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); + xfs_destroy_zones(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/kernel/fs/xfs/xfs_super.h b/kernel/fs/xfs/xfs_super.h new file mode 100644 index 000000000..499058fea --- /dev/null +++ b/kernel/fs/xfs/xfs_super.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#include + +#ifdef CONFIG_XFS_QUOTA +extern int xfs_qm_init(void); +extern void xfs_qm_exit(void); +#else +# define xfs_qm_init() (0) +# define xfs_qm_exit() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#define XFS_SECURITY_STRING "security attributes, " + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_VERSION_STRING "SGI XFS" +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 
+ XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_flush_inodes(struct xfs_mount *mp); +extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount); + +extern const struct export_operations xfs_export_operations; +extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct quotactl_ops xfs_quotactl_operations; + +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) + +#endif /* __XFS_SUPER_H__ */ diff --git a/kernel/fs/xfs/xfs_symlink.c b/kernel/fs/xfs/xfs_symlink.c new file mode 100644 index 000000000..3df411ead --- /dev/null +++ b/kernel/fs/xfs/xfs_symlink.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_log.h" + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) 
{ + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + error = -EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, bp->b_addr, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = -EFSCORRUPTED; + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = 
NULL; + struct xfs_dquot *pdqp = NULL; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return -ENAMETOOLONG; + + udqp = gdqp = NULL; + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, + xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = xfs_symlink_blocks(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + if (error == -ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = -EPERM; + goto out_trans_cancel; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + /* + * Check for ability to enter directory entry, if no space reserved. 
+ */ + if (!resblks) { + error = xfs_dir_canenter(tp, dp, link_name); + if (error) + goto out_trans_cancel; + } + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) + goto out_trans_cancel; + + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. 
+ */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto out_bmap_cancel; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = -ENOMEM; + goto out_bmap_cancel; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto out_bmap_cancel; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. 
+ */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + +out_bmap_cancel: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + struct xfs_inode *ip) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + int size; + xfs_trans_t *tp; + + mp = ip->i_mount; + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Lock the inode, fix the size, and join it to the transaction. 
+ * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error_trans_cancel; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = -ENOMEM; + goto error_bmap_cancel; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done); + if (error) + goto error_bmap_cancel; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_bmap_cancel; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Commit the transaction containing extent freeing and EFDs. + */ + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error_unlock; + } + + /* + * Remove the memory for extent descriptions (just bookkeeping). 
+ */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_bmap_cancel: + xfs_bmap_cancel(&free_list); +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + } + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(0); + return -EFSCORRUPTED; + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip); +} diff --git a/kernel/fs/xfs/xfs_symlink.h b/kernel/fs/xfs/xfs_symlink.h new file mode 100644 index 000000000..e75245d09 --- /dev/null +++ b/kernel/fs/xfs/xfs_symlink.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 + +/* Kernel only symlink definitions */ + +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink(struct xfs_inode *ip); + +#endif /* __XFS_SYMLINK_H */ diff --git a/kernel/fs/xfs/xfs_sysctl.c b/kernel/fs/xfs/xfs_sysctl.c new file mode 100644 index 000000000..a0c8067ce --- /dev/null +++ b/kernel/fs/xfs/xfs_sysctl.c @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include +#include +#include "xfs_error.h" + +static struct ctl_table_header *xfs_table_header; + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + + if (!ret && write && *valp) { + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} + +STATIC int +xfs_panic_mask_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static struct ctl_table xfs_table[] = { + { + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .procname = 
"panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_panic_mask_proc_handler, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + + { + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .procname = "inherit_nodefrag", + .data = 
&xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + { + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_stats_clear_proc_handler, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, +#endif /* CONFIG_PROC_FS */ + + {} +}; + +static struct ctl_table xfs_dir_table[] = { + { + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} +}; + +static struct ctl_table xfs_root_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} +}; + +int +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; +} + +void +xfs_sysctl_unregister(void) +{ + unregister_sysctl_table(xfs_table_header); +} diff --git a/kernel/fs/xfs/xfs_sysctl.h b/kernel/fs/xfs/xfs_sysctl.h new file mode 100644 index 000000000..ffef45375 --- /dev/null +++ b/kernel/fs/xfs/xfs_sysctl.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ + xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. 
*/ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + /* XFS_RESTRICT_CHOWN = 3 */ + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, + XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, +}; + +extern xfs_param_t xfs_params; + +struct xfs_globals { + int log_recovery_delay; /* log recovery delay (secs) */ +}; +extern struct xfs_globals xfs_globals; + +#ifdef CONFIG_SYSCTL +extern int xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/kernel/fs/xfs/xfs_sysfs.c b/kernel/fs/xfs/xfs_sysfs.c new file mode 100644 index 000000000..aa0367085 --- /dev/null +++ b/kernel/fs/xfs/xfs_sysfs.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_sysfs.h" +#include "xfs_log_format.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" + +struct xfs_sysfs_attr { + struct attribute attr; + ssize_t (*show)(char *buf, void *data); + ssize_t (*store)(const char *buf, size_t count, void *data); +}; + +static inline struct xfs_sysfs_attr * +to_attr(struct attribute *attr) +{ + return container_of(attr, struct xfs_sysfs_attr, attr); +} + +#define XFS_SYSFS_ATTR_RW(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) +#define XFS_SYSFS_ATTR_RO(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) + +#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr + +/* + * xfs_mount kobject. This currently has no attributes and thus no need for show + * and store helpers. The mp kobject serves as the per-mount parent object that + * is identified by the fsname under sysfs. 
+ */ + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, +}; + +#ifdef DEBUG +/* debug */ + +STATIC ssize_t +log_recovery_delay_store( + const char *buf, + size_t count, + void *data) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.log_recovery_delay = val; + + return count; +} + +STATIC ssize_t +log_recovery_delay_show( + char *buf, + void *data) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); +} +XFS_SYSFS_ATTR_RW(log_recovery_delay); + +static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(log_recovery_delay), + NULL, +}; + +STATIC ssize_t +xfs_dbg_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; +} + +STATIC ssize_t +xfs_dbg_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? 
xfs_attr->store(buf, count, NULL) : 0; +} + +static struct sysfs_ops xfs_dbg_ops = { + .show = xfs_dbg_show, + .store = xfs_dbg_store, +}; + +struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_dbg_ops, + .default_attrs = xfs_dbg_attrs, +}; + +#endif /* DEBUG */ + +/* xlog */ + +STATIC ssize_t +log_head_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + spin_lock(&log->l_icloglock); + cycle = log->l_curr_cycle; + block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_head_lsn); + +STATIC ssize_t +log_tail_lsn_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int block; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); +} +XFS_SYSFS_ATTR_RO(log_tail_lsn); + +STATIC ssize_t +reserve_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(reserve_grant_head); + +STATIC ssize_t +write_grant_head_show( + char *buf, + void *data) +{ + struct xlog *log = data; + int cycle; + int bytes; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); + return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); +} +XFS_SYSFS_ATTR_RO(write_grant_head); + +static struct attribute *xfs_log_attrs[] = { + ATTR_LIST(log_head_lsn), + ATTR_LIST(log_tail_lsn), + ATTR_LIST(reserve_grant_head), + ATTR_LIST(write_grant_head), + NULL, +}; + +static inline struct xlog * +to_xlog(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xlog, l_kobj); +} + +STATIC ssize_t +xfs_log_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xlog *log = 
to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, log) : 0; +} + +STATIC ssize_t +xfs_log_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0; +} + +static struct sysfs_ops xfs_log_ops = { + .show = xfs_log_show, + .store = xfs_log_store, +}; + +struct kobj_type xfs_log_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_log_ops, + .default_attrs = xfs_log_attrs, +}; diff --git a/kernel/fs/xfs/xfs_sysfs.h b/kernel/fs/xfs/xfs_sysfs.h new file mode 100644 index 000000000..240eee35f --- /dev/null +++ b/kernel/fs/xfs/xfs_sysfs.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2014 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef __XFS_SYSFS_H__ +#define __XFS_SYSFS_H__ + +extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern struct kobj_type xfs_dbg_ktype; /* debug */ +extern struct kobj_type xfs_log_ktype; /* xlog */ + +static inline struct xfs_kobj * +to_kobj(struct kobject *kobject) +{ + return container_of(kobject, struct xfs_kobj, kobject); +} + +static inline void +xfs_sysfs_release(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + complete(&kobj->complete); +} + +static inline int +xfs_sysfs_init( + struct xfs_kobj *kobj, + struct kobj_type *ktype, + struct xfs_kobj *parent_kobj, + const char *name) +{ + init_completion(&kobj->complete); + return kobject_init_and_add(&kobj->kobject, ktype, + &parent_kobj->kobject, "%s", name); +} + +static inline void +xfs_sysfs_del( + struct xfs_kobj *kobj) +{ + kobject_del(&kobj->kobject); + kobject_put(&kobj->kobject); + wait_for_completion(&kobj->complete); +} + +#endif /* __XFS_SYSFS_H__ */ diff --git a/kernel/fs/xfs/xfs_trace.c b/kernel/fs/xfs/xfs_trace.c new file mode 100644 index 000000000..13a029806 --- /dev/null +++ b/kernel/fs/xfs/xfs_trace.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_da_btree.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_filestream.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/kernel/fs/xfs/xfs_trace.h b/kernel/fs/xfs/xfs_trace.h new file mode 100644 index 000000000..615781bf4 --- /dev/null +++ b/kernel/fs/xfs/xfs_trace.h @@ -0,0 +1,2083 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xfs_log_item; +struct xlog; +struct xlog_ticket; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; +struct xfs_bmbt_irec; + +/* + * Event class shared by the xfs_attr_list_* tracepoints. It snapshots the + * attr list context: device, inode, cursor hash/block/offset, the output + * buffer and its size, and the count/firstu/dupcnt/flags fields. + */ +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + /* TP_printk prints dupcnt, so it has to be captured here rather than left unset */ + __entry->dupcnt = ctx->dupcnt; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + 
TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); + +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount = refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); + +DECLARE_EVENT_CLASS(xfs_ag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), + TP_ARGS(mp, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + ), + 
TP_printk("dev %d:%d agno %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno) +); +#define DEFINE_AG_EVENT(name) \ +DEFINE_EVENT(xfs_ag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ + TP_ARGS(mp, agno)) + +DEFINE_AG_EVENT(xfs_read_agf); +DEFINE_AG_EVENT(xfs_alloc_read_agf); +DEFINE_AG_EVENT(xfs_read_agi); +DEFINE_AG_EVENT(xfs_ialloc_read_agi); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long 
caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? 
+ ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " + "lock %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned 
long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_submit); +DEFINE_BUF_EVENT(xfs_buf_submit_wait); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + 
TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + 
__entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = bip->bli_buf->b_sema.count; + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); 
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); + +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); 
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_get_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_evict_inode); +DEFINE_INODE_EVENT(xfs_update_time); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); + +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + +DECLARE_EVENT_CLASS(xfs_iref_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(int, pincount) + __field(unsigned long, caller_ip) + ), + 
TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + __entry->pincount, + (char *)__entry->caller_ip) +) + +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx 
name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __entry->namelen, + __get_str(name)) +) + +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __field(int, src_namelen) + __field(int, target_namelen) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + __entry->src_namelen = src_name->len; + __entry->target_namelen = target_name->len; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, + target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %.*s target name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __entry->src_namelen, + __get_str(src_name), + __entry->target_namelen, + __get_str(target_name)) +) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + 
__field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->id, + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqget_freeing); +DEFINE_DQUOT_EVENT(xfs_dqget_dup); +DEFINE_DQUOT_EVENT(xfs_dqput); 
+DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(int, reserveq) + __field(int, writeq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserveq = list_empty(&log->l_reserve_head.waiters); + __entry->writeq = list_empty(&log->l_write_head.waiters); + xlog_crack_grant_head(&log->l_reserve_head.grant, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_write_head.grant, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + 
__entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + 
__print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); + +DECLARE_EVENT_CLASS(xfs_ail_class, + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), + TP_ARGS(lip, old_lsn, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, new_lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->old_lsn = old_lsn; + __entry->new_lsn = new_lsn; + ), + TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_EVENT(name) \ +DEFINE_EVENT(xfs_ail_class, name, \ + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ + TP_ARGS(lip, old_lsn, new_lsn)) +DEFINE_AIL_EVENT(xfs_ail_insert); +DEFINE_AIL_EVENT(xfs_ail_move); +DEFINE_AIL_EVENT(xfs_ail_delete); + +TRACE_EVENT(xfs_log_assign_tail_lsn, + TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), + 
TP_ARGS(log, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, new_lsn) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, last_sync_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->new_lsn = new_lsn; + __entry->old_lsn = atomic64_read(&log->l_tail_lsn); + __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + ), + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) +) + +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) +) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); + +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), + 
TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(unsigned int, length) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->length = len; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "length %x delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->length, + __entry->delalloc, + __entry->unwritten) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +DECLARE_EVENT_CLASS(xfs_imap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, type) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->type = type; + __entry->startoff = irec ? 
irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_imap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); + +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, isize) + __field(loff_t, disize) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->isize = VFS_I(ip)->i_size; + __entry->disize = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->disize, + __entry->offset, + __entry->count) +); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + 
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); +DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + 
__field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +DECLARE_EVENT_CLASS(xfs_extent_busy_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); + +TRACE_EVENT(xfs_extent_busy_trim, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_STRUCT__entry( + __field(dev_t, 
dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->tbno = tbno; + __entry->tlen = tlen; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->tbno, + __entry->tlen) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + 
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? 
"right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + 
__entry->wasfromfl, + __entry->isfl, + __entry->userdata, + (unsigned long long)__entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_da_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = 
args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if 
(args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_ATTR_EVENT(name) \ +DEFINE_EVENT(xfs_attr_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_ATTR_EVENT(xfs_attr_sf_add); +DEFINE_ATTR_EVENT(xfs_attr_sf_addname); +DEFINE_ATTR_EVENT(xfs_attr_sf_create); +DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_sf_remove); +DEFINE_ATTR_EVENT(xfs_attr_sf_removename); +DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); + +DEFINE_ATTR_EVENT(xfs_attr_leaf_add); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); +DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); +DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); +DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); +DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); +DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); +DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); + 
+DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); +DEFINE_ATTR_EVENT(xfs_attr_node_lookup); +DEFINE_ATTR_EVENT(xfs_attr_node_replace); +DEFINE_ATTR_EVENT(xfs_attr_node_removename); + +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + +#define DEFINE_DA_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DA_EVENT(xfs_da_split); +DEFINE_DA_EVENT(xfs_da_join); +DEFINE_DA_EVENT(xfs_da_link_before); +DEFINE_DA_EVENT(xfs_da_link_after); +DEFINE_DA_EVENT(xfs_da_unlink_back); +DEFINE_DA_EVENT(xfs_da_unlink_forward); +DEFINE_DA_EVENT(xfs_da_root_split); +DEFINE_DA_EVENT(xfs_da_root_join); +DEFINE_DA_EVENT(xfs_da_node_add); +DEFINE_DA_EVENT(xfs_da_node_create); +DEFINE_DA_EVENT(xfs_da_node_split); +DEFINE_DA_EVENT(xfs_da_node_remove); +DEFINE_DA_EVENT(xfs_da_node_rebalance); +DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); +DEFINE_DA_EVENT(xfs_da_swap_lastblock); +DEFINE_DA_EVENT(xfs_da_grow_inode); +DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct 
xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + "src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + 
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct xlog *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct xlog *log, struct 
xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + 
__entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + +#endif /* _TRACE_XFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE xfs_trace +#include diff --git a/kernel/fs/xfs/xfs_trans.c b/kernel/fs/xfs/xfs_trans.c new file mode 100644 index 000000000..220ef2c90 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans.c @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_extent_busy.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_trace.h" +#include "xfs_error.h" + +kmem_zone_t *xfs_trans_zone; +kmem_zone_t *xfs_log_item_desc_zone; + +/* + * Initialize the precomputed transaction reservation values + * in the mount structure. + */ +void +xfs_trans_init( + struct xfs_mount *mp) +{ + xfs_trans_resv_calc(mp, M_RES(mp)); +} + +/* + * This routine is called to allocate a transaction structure. + * The type parameter indicates the type of the transaction. These + * are enumerated in xfs_trans.h. + * + * Dynamically allocate the transaction structure from the transaction + * zone, initialize it, and return it to the caller. 
+ */
+xfs_trans_t *
+xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_trans_t	*tp;
+
+	/*
+	 * Take freeze protection before allocating.  XFS_TRANS_FREEZE_PROT
+	 * records that xfs_trans_free() must drop it via sb_end_intwrite().
+	 */
+	sb_start_intwrite(mp->m_super);
+	tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
+	tp->t_flags |= XFS_TRANS_FREEZE_PROT;
+	return tp;
+}
+
+/*
+ * Allocate and initialise a transaction of the given type without taking
+ * freeze protection.  The mount's active transaction count is bumped here
+ * and dropped again in xfs_trans_free().
+ */
+xfs_trans_t *
+_xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type,
+	xfs_km_flags_t	memflags)
+{
+	xfs_trans_t	*tp;
+
+	WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+	atomic_inc(&mp->m_active_trans);
+
+	tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
+	tp->t_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_type = type;
+	tp->t_mountp = mp;
+	INIT_LIST_HEAD(&tp->t_items);
+	INIT_LIST_HEAD(&tp->t_busy);
+	return tp;
+}
+
+/*
+ * Free the transaction structure.  If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+	struct xfs_trans	*tp)
+{
+	xfs_extent_busy_sort(&tp->t_busy);
+	xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+
+	atomic_dec(&tp->t_mountp->m_active_trans);
+	/* Only the holder of the freeze reference releases it (see dup). */
+	if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+		sb_end_intwrite(tp->t_mountp->m_super);
+	xfs_trans_free_dqinfo(tp);
+	kmem_zone_free(xfs_trans_zone, tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction.  The remaining
+ * unused block and rt extent reservations are also inherited.  This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks.  Locks and log items, however, are not inherited.  They must
+ * be added to the new transaction explicitly.
+ */
+xfs_trans_t *
+xfs_trans_dup(
+	xfs_trans_t	*tp)
+{
+	xfs_trans_t	*ntp;
+
+	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+
+	/*
+	 * Initialize the new transaction structure.
+	 */
+	ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
+	ntp->t_type = tp->t_type;
+	ntp->t_mountp = tp->t_mountp;
+	INIT_LIST_HEAD(&ntp->t_items);
+	INIT_LIST_HEAD(&ntp->t_busy);
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(tp->t_ticket != NULL);
+
+	ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
+		       (tp->t_flags & XFS_TRANS_RESERVE) |
+		       (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+	/* We gave our writer reference to the new transaction */
+	tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
+	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
+	/* Hand the unused part of each reservation over to the new trans. */
+	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+	tp->t_blk_res = tp->t_blk_res_used;
+	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+	tp->t_rtx_res = tp->t_rtx_res_used;
+	ntp->t_pflags = tp->t_pflags;
+
+	xfs_trans_dup_dqinfo(tp, ntp);
+
+	atomic_inc(&tp->t_mountp->m_active_trans);
+	return ntp;
+}
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction.  This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * The log reservation is described by resp; XFS_TRANS_PERM_LOG_RES in
+ * resp->tr_logflags requests a permanent reservation, which is used by
+ * long running transactions.  If any one of the reservations fails then
+ * they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+int
+xfs_trans_reserve(
+	struct xfs_trans	*tp,
+	struct xfs_trans_res	*resp,
+	uint			blocks,
+	uint			rtextents)
+{
+	int		error = 0;
+	bool		rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/* Mark this thread as being in a transaction */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	/*
+	 * Attempt to reserve the needed disk blocks by decrementing
+	 * the number needed from the number available.  This will
+	 * fail if the count would go below zero.
+	 */
+	if (blocks > 0) {
+		error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
+		if (error != 0) {
+			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+			return -ENOSPC;
+		}
+		tp->t_blk_res += blocks;
+	}
+
+	/*
+	 * Reserve the log space needed for this transaction.
+	 */
+	if (resp->tr_logres > 0) {
+		bool	permanent = false;
+
+		ASSERT(tp->t_log_res == 0 ||
+		       tp->t_log_res == resp->tr_logres);
+		ASSERT(tp->t_log_count == 0 ||
+		       tp->t_log_count == resp->tr_logcount);
+
+		if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
+			tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+			permanent = true;
+		} else {
+			ASSERT(tp->t_ticket == NULL);
+			ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+		}
+
+		/* An existing ticket is regranted; otherwise get a new one. */
+		if (tp->t_ticket != NULL) {
+			ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
+			error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+		} else {
+			error = xfs_log_reserve(tp->t_mountp,
+						resp->tr_logres,
+						resp->tr_logcount,
+						&tp->t_ticket, XFS_TRANSACTION,
+						permanent, tp->t_type);
+		}
+
+		if (error)
+			goto undo_blocks;
+
+		tp->t_log_res = resp->tr_logres;
+		tp->t_log_count = resp->tr_logcount;
+	}
+
+	/*
+	 * Attempt to reserve the needed realtime extents by decrementing
+	 * the number needed from the number available.  This will
+	 * fail if the count would go below zero.
+	 */
+	if (rtextents > 0) {
+		error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
+		if (error) {
+			error = -ENOSPC;
+			goto undo_log;
+		}
+		tp->t_rtx_res += rtextents;
+	}
+
+	return 0;
+
+	/*
+	 * Error cases jump to one of these labels to undo any
+	 * reservations which have already been performed.
+	 */
+undo_log:
+	if (resp->tr_logres > 0) {
+		int		log_flags;
+
+		if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
+		tp->t_ticket = NULL;
+		tp->t_log_res = 0;
+		tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+	}
+
+undo_blocks:
+	if (blocks > 0) {
+		/*
+		 * Give back what was taken above: the reservation used a
+		 * negative delta, so the undo must use a positive one.
+		 */
+		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
+		tp->t_blk_res = 0;
+	}
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	return error;
+}
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ *
+ * Because we may not be keeping track of allocated/free inodes and
+ * used filesystem blocks in the superblock, we do not mark the
+ * superblock dirty in this transaction if we modify these fields.
+ * We still need to update the transaction deltas so that they get
+ * applied to the incore superblock, but we don't want them to
+ * cause the superblock to get locked and logged if these are the
+ * only fields in the superblock that the transaction modifies.
+ */
+void
+xfs_trans_mod_sb(
+	xfs_trans_t	*tp,
+	uint		field,
+	int64_t		delta)
+{
+	uint32_t	flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY);
+	xfs_mount_t	*mp = tp->t_mountp;
+
+	switch (field) {
+	case XFS_TRANS_SB_ICOUNT:
+		tp->t_icount_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_IFREE:
+		tp->t_ifree_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_FDBLOCKS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.  Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_blk_res_used += (uint)-delta;
+			ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+		}
+		tp->t_fdblocks_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_RES_FDBLOCKS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_fdblocks_delta += delta;
+		if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+			flags &= ~XFS_TRANS_SB_DIRTY;
+		break;
+	case XFS_TRANS_SB_FREXTENTS:
+		/*
+		 * Track the number of realtime extents allocated in the
+		 * transaction.  Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_rtx_res_used += (uint)-delta;
+			ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+		}
+		tp->t_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FREXTENTS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_DBLOCKS:
+		ASSERT(delta > 0);
+		tp->t_dblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_AGCOUNT:
+		ASSERT(delta > 0);
+		tp->t_agcount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IMAXPCT:
+		tp->t_imaxpct_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSIZE:
+		tp->t_rextsize_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBMBLOCKS:
+		tp->t_rbmblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBLOCKS:
+		tp->t_rblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTENTS:
+		tp->t_rextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSLOG:
+		tp->t_rextslog_delta += delta;
+		break;
+	default:
+		/* Unknown field: leave the transaction flags untouched. */
+		ASSERT(0);
+		return;
+	}
+
+	tp->t_flags |= flags;
+}
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+	xfs_trans_t	*tp)
+{
+	xfs_dsb_t	*sbp;
+	xfs_buf_t	*bp;
+	int		whole = 0;
+
+	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+
+	/*
+	 * Check that superblock mods match the mods made to AGF counters.
+	 */
+	ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
+	       (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
+		tp->t_ag_btree_delta));
+
+	/*
+	 * Only update the superblock counters if we are logging them
+	 */
+	if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) {
+		if (tp->t_icount_delta)
+			be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
+		if (tp->t_ifree_delta)
+			be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta);
+		if (tp->t_fdblocks_delta)
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta);
+		if (tp->t_res_fdblocks_delta)
+			be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
+	}
+
+	if (tp->t_frextents_delta)
+		be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
+	if (tp->t_res_frextents_delta)
+		be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+
+	/* Any change below forces the whole superblock to be logged. */
+	if (tp->t_dblocks_delta) {
+		be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_agcount_delta) {
+		be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta);
+		whole = 1;
+	}
+	if (tp->t_imaxpct_delta) {
+		sbp->sb_imax_pct += tp->t_imaxpct_delta;
+		whole = 1;
+	}
+	if (tp->t_rextsize_delta) {
+		be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta);
+		whole = 1;
+	}
+	if (tp->t_rbmblocks_delta) {
+		be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rblocks_delta) {
+		be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rextents_delta) {
+		be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta);
+		whole = 1;
+	}
+	if (tp->t_rextslog_delta) {
+		sbp->sb_rextslog += tp->t_rextslog_delta;
+		whole = 1;
+	}
+
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	if (whole)
+		/*
+		 * Log the whole thing, the fields are noncontiguous.
+		 */
+		xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1);
+	else
+		/*
+		 * Since all the modifiable fields are contiguous, we
+		 * can get away with this.
+		 */
+		xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
+				  offsetof(xfs_dsb_t, sb_frextents) +
+				  sizeof(sbp->sb_frextents) - 1);
+}
+
+/*
+ * xfs_sb_mod8/32/64: add a signed delta to an in-core superblock field of
+ * the matching width.  If the result would be negative the field is left
+ * untouched and -EINVAL is returned (after asserting); otherwise the new
+ * value is stored and 0 is returned.
+ */
+STATIC int
+xfs_sb_mod8(
+	uint8_t			*field,
+	int8_t			delta)
+{
+	int8_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+	uint32_t		*field,
+	int32_t			delta)
+{
+	int32_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+	uint64_t		*field,
+	int64_t			delta)
+{
+	int64_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
+ * and apply superblock counter changes to the in-core superblock.  The
+ * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
+ * applied to the in-core superblock.  The idea is that that has already been
+ * done.
+ *
+ * If we are not logging superblock counters, then the inode allocated/free and
+ * used block counts are not updated in the on disk superblock. In this case,
+ * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
+ * still need to update the incore superblock with the changes.
+ */
+void
+xfs_trans_unreserve_and_mod_sb(
+	struct xfs_trans	*tp)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int64_t			blkdelta = 0;
+	int64_t			rtxdelta = 0;
+	int64_t			idelta = 0;
+	int64_t			ifreedelta = 0;
+	int			error;
+
+	/*
+	 * calculate deltas: start from the unused reservation being handed
+	 * back and add the transaction's own counter changes where they
+	 * apply to the in-core superblock.
+	 */
+	if (tp->t_blk_res > 0)
+		blkdelta = tp->t_blk_res;
+	if ((tp->t_fdblocks_delta != 0) &&
+	    (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)))
+		blkdelta += tp->t_fdblocks_delta;
+
+	if (tp->t_rtx_res > 0)
+		rtxdelta = tp->t_rtx_res;
+	if ((tp->t_frextents_delta != 0) &&
+	    (tp->t_flags & XFS_TRANS_SB_DIRTY))
+		rtxdelta += tp->t_frextents_delta;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb) ||
+	     (tp->t_flags & XFS_TRANS_SB_DIRTY)) {
+		idelta = tp->t_icount_delta;
+		ifreedelta = tp->t_ifree_delta;
+	}
+
+	/* apply the per-cpu counters */
+	if (blkdelta) {
+		error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
+		if (error)
+			goto out;
+	}
+
+	if (idelta) {
+		error = xfs_mod_icount(mp, idelta);
+		if (error)
+			goto out_undo_fdblocks;
+	}
+
+	if (ifreedelta) {
+		error = xfs_mod_ifree(mp, ifreedelta);
+		if (error)
+			goto out_undo_icount;
+	}
+
+	/* Nothing left that needs m_sb_lock: skip taking it. */
+	if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+		return;
+
+	/* apply remaining deltas */
+	spin_lock(&mp->m_sb_lock);
+	if (rtxdelta) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+		if (error)
+			goto out_undo_ifree;
+	}
+
+	if (tp->t_dblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+		if (error)
+			goto out_undo_frextents;
+	}
+	if (tp->t_agcount_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
+		if (error)
+			goto out_undo_dblocks;
+	}
+	if (tp->t_imaxpct_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+		if (error)
+			goto out_undo_agcount;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+				     tp->t_rextsize_delta);
+		if (error)
+			goto out_undo_imaxpct;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+				     tp->t_rbmblocks_delta);
+		if (error)
+			goto out_undo_rextsize;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+		if (error)
+			goto out_undo_rbmblocks;
+	}
+	if (tp->t_rextents_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+				     tp->t_rextents_delta);
+		if (error)
+			goto out_undo_rblocks;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+				     tp->t_rextslog_delta);
+		if (error)
+			goto out_undo_rextents;
+	}
+	spin_unlock(&mp->m_sb_lock);
+	return;
+
+	/*
+	 * Unwind in reverse order of application.  Each label undoes the
+	 * field applied just before the one that failed, guarded by that
+	 * same field's delta.
+	 */
+out_undo_rextents:
+	if (tp->t_rextents_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+	if (tp->t_rblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+	if (tp->t_rbmblocks_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+	if (tp->t_rextsize_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+	/* Guard on the imaxpct delta itself, not on the rextsize delta. */
+	if (tp->t_imaxpct_delta)
+		xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+	if (tp->t_agcount_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+	if (tp->t_dblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+	if (rtxdelta)
+		xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+	spin_unlock(&mp->m_sb_lock);
+	if (ifreedelta)
+		xfs_mod_ifree(mp, -ifreedelta);
+out_undo_icount:
+	if (idelta)
+		xfs_mod_icount(mp, -idelta);
+out_undo_fdblocks:
+	if (blkdelta)
+		xfs_mod_fdblocks(mp, -blkdelta, rsvd);
+out:
+	/* Only reached on failure; there is no error return to the caller. */
+	ASSERT(error == 0);
+	return;
+}
+
+/*
+ * Add the given log item to the transaction's list of log items.
+ *
+ * The log item will now point to its new descriptor with its li_desc field.
+ */
+void
+xfs_trans_add_item(
+	struct xfs_trans	*tp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_log_item_desc *desc;
+
+	ASSERT(lip->li_mountp == tp->t_mountp);
+	ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+
+	/* Fresh, zeroed descriptor tying the item to this transaction. */
+	desc = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
+	desc->lid_flags = 0;
+	desc->lid_item = lip;
+
+	/* Queue it at the tail of the transaction's item list. */
+	list_add_tail(&desc->lid_trans, &tp->t_items);
+	lip->li_desc = desc;
+}
+
+/*
+ * Take a descriptor off whatever list it is on and hand it back to the
+ * descriptor zone.
+ */
+STATIC void
+xfs_trans_free_item_desc(
+	struct xfs_log_item_desc *lidp)
+{
+	list_del_init(&lidp->lid_trans);
+	kmem_zone_free(xfs_log_item_desc_zone, lidp);
+}
+
+/*
+ * Detach a log item from its transaction: release the descriptor it
+ * points at and clear the item's back pointer.
+ */
+void
+xfs_trans_del_item(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_log_item_desc *desc = lip->li_desc;
+
+	xfs_trans_free_item_desc(desc);
+	lip->li_desc = NULL;
+}
+
+/*
+ * Walk every item attached to the transaction, unlock it and release its
+ * descriptor.  When a commit lsn is supplied the item's iop_committing
+ * method is called first; when XFS_TRANS_ABORT is set in flags the item
+ * is marked XFS_LI_ABORTED before it is unlocked.
+ */
+void
+xfs_trans_free_items(
+	struct xfs_trans	*tp,
+	xfs_lsn_t		commit_lsn,
+	int			flags)
+{
+	struct xfs_log_item_desc *cur_desc;
+	struct xfs_log_item_desc *next_desc;
+	bool			committing = (commit_lsn != NULLCOMMITLSN);
+	bool			aborting = (flags & XFS_TRANS_ABORT) != 0;
+
+	list_for_each_entry_safe(cur_desc, next_desc, &tp->t_items, lid_trans) {
+		struct xfs_log_item	*item = cur_desc->lid_item;
+
+		item->li_desc = NULL;
+
+		if (committing)
+			item->li_ops->iop_committing(item, commit_lsn);
+		if (aborting)
+			item->li_flags |= XFS_LI_ABORTED;
+		item->li_ops->iop_unlock(item);
+
+		xfs_trans_free_item_desc(cur_desc);
+	}
+}
+
+/*
+ * Insert one batch of log items into the AIL at commit_lsn and then unpin
+ * each of them.
+ */
+static inline void
+xfs_log_item_batch_insert(
+	struct xfs_ail		*ailp,
+	struct xfs_ail_cursor	*cur,
+	struct xfs_log_item	**log_items,
+	int			nr_items,
+	xfs_lsn_t		commit_lsn)
+{
+	int	idx = 0;
+
+	spin_lock(&ailp->xa_lock);
+	/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
+	xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
+
+	while (idx < nr_items) {
+		struct xfs_log_item *item = log_items[idx];
+
+		item->li_ops->iop_unpin(item, 0);
+		idx++;
+	}
+}
+
+/*
+ * Bulk operation version of xfs_trans_committed that takes a log vector of
+ * items to insert into the AIL.
This uses bulk AIL insertion techniques to
+ * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through iop_commited [sic - presumably
+ * iop_committing is meant, TODO confirm] and iop_unlock, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is iop_committed processing, followed by an
+ * iop_unpin(aborted) call.
+ *
+ * The AIL cursor is used to optimise the insert process. If commit_lsn is not
+ * at the end of the AIL, the insert cursor avoids the need to walk
+ * the AIL to find the insertion point on every xfs_log_item_batch_insert()
+ * call. This saves a lot of needless list walking and is a net win, even
+ * though it slightly increases the amount of AIL lock traffic to set it up
+ * and tear it down.
+ *
+ * Items whose iop_committed() result equals commit_lsn are collected in
+ * batches of up to LOG_ITEM_BATCH_SIZE and inserted together; all other
+ * items are handled one at a time inside the loop.
+ */
+void
+xfs_trans_committed_bulk(
+	struct xfs_ail		*ailp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		commit_lsn,
+	int			aborted)
+{
+#define LOG_ITEM_BATCH_SIZE	32
+	struct xfs_log_item	*log_items[LOG_ITEM_BATCH_SIZE];
+	struct xfs_log_vec	*lv;
+	struct xfs_ail_cursor	cur;
+	int			i = 0;
+
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn);
+	spin_unlock(&ailp->xa_lock);
+
+	/* unpin all the log items, walking the log vector chain */
+	for (lv = log_vector; lv; lv = lv->lv_next ) {
+		struct xfs_log_item	*lip = lv->lv_item;
+		xfs_lsn_t		item_lsn;
+
+		if (aborted)
+			lip->li_flags |= XFS_LI_ABORTED;
+		item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+
+		/* item_lsn of -1 means the item needs no further processing */
+		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
+			continue;
+
+		/*
+		 * if we are aborting the operation, no point in inserting the
+		 * object into the AIL as we are in a shutdown situation.
+		 */
+		if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+			lip->li_ops->iop_unpin(lip, 1);
+			continue;
+		}
+
+		if (item_lsn != commit_lsn) {
+
+			/*
+			 * Not a bulk update option due to unusual item_lsn.
+			 * Push into AIL immediately, rechecking the lsn once
+			 * we have the ail lock. Then unpin the item. This does
+			 * not affect the AIL cursor the bulk insert path is
+			 * using.
+			 *
+			 * NOTE(review): only the else branch unlocks xa_lock
+			 * here, so xfs_trans_ail_update() is presumably
+			 * expected to drop the lock itself - confirm against
+			 * its definition.
+			 */
+			spin_lock(&ailp->xa_lock);
+			if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
+				xfs_trans_ail_update(ailp, lip, item_lsn);
+			else
+				spin_unlock(&ailp->xa_lock);
+			lip->li_ops->iop_unpin(lip, 0);
+			continue;
+		}
+
+		/* Item is a candidate for bulk AIL insert; flush a full batch. */
+		log_items[i++] = lv->lv_item;
+		if (i >= LOG_ITEM_BATCH_SIZE) {
+			xfs_log_item_batch_insert(ailp, &cur, log_items,
+					LOG_ITEM_BATCH_SIZE, commit_lsn);
+			i = 0;
+		}
+	}
+
+	/* make sure we insert the remainder! */
+	if (i)
+		xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
+
+	spin_lock(&ailp->xa_lock);
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+}
+
+/*
+ * Commit the given transaction to the log.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * Do not reference the transaction structure after this call.
+ *
+ * Returns 0 on success.  Returns -EIO if the filesystem is already shut
+ * down, or if releasing the log ticket of a transaction that was not
+ * logged yields a commit lsn of -1.  For a synchronous transaction the
+ * result of the log force is returned.
+ */
+int
+xfs_trans_commit(
+	struct xfs_trans	*tp,
+	uint			flags)
+{
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_lsn_t		commit_lsn = -1;
+	int			error = 0;
+	int			log_flags = 0;
+	int			sync = tp->t_flags & XFS_TRANS_SYNC;
+
+	/*
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
+	 */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	}
+
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+	if (!(tp->t_flags & XFS_TRANS_DIRTY))
+		goto out_unreserve;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error = -EIO;
+		goto out_unreserve;
+	}
+
+	ASSERT(tp->t_ticket != NULL);
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY)
+		xfs_trans_apply_sb_deltas(tp);
+	xfs_trans_apply_dquot_deltas(tp);
+
+	xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
+
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free(tp);
+
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.  tp has been freed by this point,
+	 * which is why the sync flag was sampled into a local on entry.
+	 */
+	if (sync) {
+		error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+		XFS_STATS_INC(xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xs_trans_async);
+	}
+
+	return error;
+
+out_unreserve:
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * It is indeed possible for the transaction to be not dirty but
+	 * the dqinfo portion to be.  All that means is that we have some
+	 * (non-persistent) quota reservations that need to be unreserved.
+	 */
+	xfs_trans_unreserve_and_mod_dquots(tp);
+	if (tp->t_ticket) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+		if (commit_lsn == -1 && !error)
+			error = -EIO;
+	}
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
+	xfs_trans_free(tp);
+
+	XFS_STATS_INC(xs_trans_empty);
+	return error;
+}
+
+/*
+ * Unlock all of the transaction's items and free the transaction.
+ * The transaction must not have modified any of its items, because
+ * there is no way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release
+ * it as well.
+ *
+ * Cancelling a dirty transaction on a filesystem that is not yet shut
+ * down is reported as an error and forces a shutdown.
+ */
+void
+xfs_trans_cancel(
+	xfs_trans_t		*tp,
+	int			flags)
+{
+	xfs_mount_t		*mp = tp->t_mountp;
+	bool			dirty = (tp->t_flags & XFS_TRANS_DIRTY) != 0;
+	int			log_flags = 0;
+
+	/*
+	 * A clean transaction has nothing to abort, so quietly drop an
+	 * abort request the caller did not really need.
+	 */
+	if (!dirty)
+		flags &= ~XFS_TRANS_ABORT;
+
+	/*
+	 * A dirty transaction cannot be undone.  Callers that detect
+	 * corruption and give up rely on us to shut the filesystem
+	 * down here if that has not happened already.
+	 */
+	if (dirty && !XFS_FORCED_SHUTDOWN(mp)) {
+		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	}
+#ifdef DEBUG
+	/* Debug check: no EFD item may be cancelled without an abort. */
+	if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) {
+		struct xfs_log_item_desc *lidp;
+
+		list_for_each_entry(lidp, &tp->t_items, lid_trans)
+			ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+	}
+#endif
+	/* Return block, rt extent and quota reservations. */
+	xfs_trans_unreserve_and_mod_sb(tp);
+	xfs_trans_unreserve_and_mod_dquots(tp);
+
+	/* Release the log reservation, if one was ever made. */
+	if (tp->t_ticket) {
+		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+			ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		}
+		xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	}
+
+	/* mark this thread as no longer being in a transaction */
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+
+	xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
+	xfs_trans_free(tp);
+}
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp)
+{
+	struct xfs_trans	*trans;
+	struct xfs_trans_res	tres;
+	int			error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *tpp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 * From here on *tpp refers to the duplicate, not to the
+	 * transaction that is about to be committed.
+	 */
+	tres.tr_logres = trans->t_log_res;
+	tres.tr_logcount = trans->t_log_count;
+	*tpp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If this commit failed, then it'd just unlock those items that
+	 * are not marked ihold. That also means that a filesystem shutdown
+	 * is in progress. The caller takes the responsibility to cancel
+	 * the duplicate transaction that gets returned.
+	 */
+	error = xfs_trans_commit(trans, 0);
+	if (error)
+		return error;
+
+	trans = *tpp;
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(trans->t_ticket);
+
+
+	/*
+	 * Reserve space in the log for the next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.  This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+	error = xfs_trans_reserve(trans, &tres, 0, 0);
+	/*
+	 * Bail out if the reservation failed; the caller still owns the
+	 * new transaction in *tpp.  Otherwise ensure that the inode is
+	 * joined to the new transaction.
+	 */
+	if (error)
+		return error;
+
+	xfs_trans_ijoin(trans, dp, 0);
+	return 0;
+}
diff --git a/kernel/fs/xfs/xfs_trans.h b/kernel/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000..b5bc1ab3c
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef	__XFS_TRANS_H__
+#define	__XFS_TRANS_H__
+
+/* kernel only transaction subsystem defines */
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_trans_res;
+struct xfs_dquot_acct;
+struct xfs_busy_extent;
+
+typedef struct xfs_log_item {
+	struct list_head		li_ail;		/* AIL pointers */
+	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
+	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc */
+	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
+	struct xfs_ail			*li_ailp;	/* ptr to AIL */
+	uint				li_type;	/* item type */
+	uint				li_flags;	/* misc flags (XFS_LI_*) */
+	struct xfs_log_item		*li_bio_list;	/* buffer item list */
+	void				(*li_cb)(struct xfs_buf *,
+						 struct xfs_log_item *);
+							/* buffer item iodone */
+							/* callback func */
+	const struct xfs_item_ops	*li_ops;	/* function list */
+
+	/* delayed logging */
+	struct list_head		li_cil;		/* CIL pointers */
+	struct xfs_log_vec		*li_lv;		/* active log vector */
+	xfs_lsn_t			li_seq;		/* CIL commit seq */
+} xfs_log_item_t;
+
+#define	XFS_LI_IN_AIL	0x1
+#define XFS_LI_ABORTED	0x2
+
+#define XFS_LI_FLAGS \
+	{ XFS_LI_IN_AIL,	"IN_AIL" }, \
+	{ XFS_LI_ABORTED,	"ABORTED" }
+
+struct xfs_item_ops {
+	void (*iop_size)(xfs_log_item_t *, int *, int *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
+	void (*iop_pin)(xfs_log_item_t *);
+	void (*iop_unpin)(xfs_log_item_t *, int remove);
+	uint (*iop_push)(struct xfs_log_item *, struct list_head *);
+	void (*iop_unlock)(xfs_log_item_t *);
+	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+};
+
+void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+			  int type, const struct xfs_item_ops *ops);
+
+/*
+ * Return values for the iop_push() routines.
+ */
+#define XFS_ITEM_SUCCESS	0
+#define XFS_ITEM_PINNED		1
+#define XFS_ITEM_LOCKED		2
+#define XFS_ITEM_FLUSHING	3
+
+
+/*
+ * This is the structure maintained for every active transaction.
+ * The *_delta fields accumulate the superblock changes recorded through
+ * xfs_trans_mod_sb().
+ */
+typedef struct xfs_trans {
+	unsigned int		t_magic;	/* magic number */
+	unsigned int		t_type;		/* transaction type */
+	unsigned int		t_log_res;	/* amt of log space resvd */
+	unsigned int		t_log_count;	/* count for perm log res */
+	unsigned int		t_blk_res;	/* # of blocks resvd */
+	unsigned int		t_blk_res_used;	/* # of resvd blocks used */
+	unsigned int		t_rtx_res;	/* # of rt extents resvd */
+	unsigned int		t_rtx_res_used;	/* # of resvd rt extents used */
+	struct xlog_ticket	*t_ticket;	/* log mgr ticket */
+	xfs_lsn_t		t_lsn;		/* log seq num of start of
+						 * transaction. */
+	xfs_lsn_t		t_commit_lsn;	/* log seq num of end of
+						 * transaction. */
+	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
+	struct xfs_dquot_acct   *t_dqinfo;	/* acctg info for dquots */
+	unsigned int		t_flags;	/* misc flags (XFS_TRANS_*) */
+	int64_t			t_icount_delta;	/* superblock icount change */
+	int64_t			t_ifree_delta;	/* superblock ifree change */
+	int64_t			t_fdblocks_delta; /* superblock fdblocks chg */
+	int64_t			t_res_fdblocks_delta; /* on-disk only chg */
+	int64_t			t_frextents_delta;/* superblock frextents chg */
+	int64_t			t_res_frextents_delta; /* on-disk only chg */
+#if defined(DEBUG) || defined(XFS_WARN)
+	int64_t			t_ag_freeblks_delta; /* debugging counter */
+	int64_t			t_ag_flist_delta; /* debugging counter */
+	int64_t			t_ag_btree_delta; /* debugging counter */
+#endif
+	int64_t			t_dblocks_delta;/* superblock dblocks change */
+	int64_t			t_agcount_delta;/* superblock agcount change */
+	int64_t			t_imaxpct_delta;/* superblock imaxpct change */
+	int64_t			t_rextsize_delta;/* superblock rextsize chg */
+	int64_t			t_rbmblocks_delta;/* superblock rbmblocks chg */
+	int64_t			t_rblocks_delta;/* superblock rblocks change */
+	int64_t			t_rextents_delta;/* superblock rextents chg */
+	int64_t			t_rextslog_delta;/* superblock rextslog chg */
+	struct list_head	t_items;	/* log item descriptors */
+	struct list_head	t_busy;		/* list of busy extents */
+	unsigned long		t_pflags;	/* saved process flags state */
+} xfs_trans_t;
+
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define	xfs_trans_get_log_res(tp)	((tp)->t_log_res)
+#define	xfs_trans_get_log_count(tp)	((tp)->t_log_count)
+#define	xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
+#define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
+
+/*
+ * Debug-only accounting of AG counter changes.  These compile away to
+ * nothing unless DEBUG or XFS_WARN is defined.  The delta argument is
+ * parenthesised before the cast so that a compound expression is
+ * converted as a whole.
+ */
+#if defined(DEBUG) || defined(XFS_WARN)
+#define	xfs_trans_agblocks_delta(tp, d)	((tp)->t_ag_freeblks_delta += (int64_t)(d))
+#define	xfs_trans_agflist_delta(tp, d)	((tp)->t_ag_flist_delta += (int64_t)(d))
+#define	xfs_trans_agbtree_delta(tp, d)	((tp)->t_ag_btree_delta += (int64_t)(d))
+#else
+#define	xfs_trans_agblocks_delta(tp, d)
+#define	xfs_trans_agflist_delta(tp, d)
+#define	xfs_trans_agbtree_delta(tp, d)
+#endif
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
+xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
+int		xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
+				  uint, uint);
+void		xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
+
+struct xfs_buf	*xfs_trans_get_buf_map(struct xfs_trans *tp,
+				       struct xfs_buftarg *target,
+				       struct xfs_buf_map *map, int nmaps,
+				       uint flags);
+
+/*
+ * Single-extent convenience wrapper around xfs_trans_get_buf_map(): builds
+ * a one-entry buffer map from blkno/numblks and passes it through.
+ */
+static inline struct xfs_buf *
+xfs_trans_get_buf(
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	int			numblks,
+	uint			flags)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
+}
+
+int		xfs_trans_read_buf_map(struct xfs_mount *mp,
+				       struct xfs_trans *tp,
+				       struct xfs_buftarg *target,
+				       struct xfs_buf_map *map, int nmaps,
+				       xfs_buf_flags_t flags,
+				       struct xfs_buf **bpp,
+				       const struct xfs_buf_ops *ops);
+
+/*
+ * Single-extent convenience wrapper around xfs_trans_read_buf_map(): builds
+ * a one-entry buffer map from blkno/numblks and returns the callee's result.
+ */
+static inline int
+xfs_trans_read_buf(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buftarg	*target,
+	xfs_daddr_t		blkno,
+	int			numblks,
+	xfs_buf_flags_t		flags,
+	struct xfs_buf		**bpp,
+	const struct xfs_buf_ops *ops)
+{
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+				      flags, bpp, ops);
+}
+
+struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+
+void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+struct xfs_efi_log_item	*xfs_trans_get_efi(xfs_trans_t *, uint);
+void		xfs_efi_release(struct xfs_efi_log_item *, uint);
+void		xfs_trans_log_efi_extent(xfs_trans_t *,
+					 struct xfs_efi_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+struct xfs_efd_log_item	*xfs_trans_get_efd(xfs_trans_t *,
+				  struct xfs_efi_log_item *,
+				  uint);
+void		xfs_trans_log_efd_extent(xfs_trans_t *,
+					 struct xfs_efd_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+int		xfs_trans_commit(xfs_trans_t *, uint flags);
+int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+void		xfs_trans_cancel(xfs_trans_t *, int);
+int		xfs_trans_ail_init(struct xfs_mount *);
+void		xfs_trans_ail_destroy(struct xfs_mount *);
+
+void		xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+				       enum xfs_blft);
+void		xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+					struct xfs_buf *src_bp);
+
+extern kmem_zone_t	*xfs_trans_zone;
+extern kmem_zone_t	*xfs_log_item_desc_zone;
+
+#endif	/* __XFS_TRANS_H__ */
diff --git
a/kernel/fs/xfs/xfs_trans_ail.c b/kernel/fs/xfs/xfs_trans_ail.c new file mode 100644 index 000000000..573aefb5a --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_ail.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_trace.h" +#include "xfs_error.h" +#include "xfs_log.h" + +#ifdef DEBUG +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. 
+ */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +} +#else /* !DEBUG */ +#define xfs_ail_check(a,l) +#endif /* DEBUG */ + +/* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} + +/* + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} + +/* + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. + * + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. + */ +xfs_lsn_t +xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. 
+ */ +static xfs_lsn_t +xfs_ail_max_lsn( + struct xfs_ail *ailp) +{ + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; + + spin_lock(&ailp->xa_lock); + lip = xfs_ail_max(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); + + return lsn; +} + +/* + * The cursor keeps track of where our current traversal is up to by tracking + * the next item in the list for us. However, for this to be safe, removing an + * object from the AIL needs to invalidate any cursor that points to it. hence + * the traversal cursor needs to be linked to the struct xfs_ail so that + * deletion can search all the active cursors for invalidation. + */ +STATIC void +xfs_trans_ail_cursor_init( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_add_tail(&cur->list, &ailp->xa_cursors); +} + +/* + * Get the next item in the traversal and advance the cursor. If the cursor + * was invalidated (indicated by a lip of 1), restart the traversal. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_next( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + struct xfs_log_item *lip = cur->item; + + if ((__psint_t)lip & 1) + lip = xfs_ail_min(ailp); + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} + +/* + * When the traversal is complete, we need to remove the cursor from the list + * of traversing cursors. + */ +void +xfs_trans_ail_cursor_done( + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_del_init(&cur->list); +} + +/* + * Invalidate any cursor that is pointing to this item. This is called when an + * item is removed from the AIL. Any cursor pointing to this object is now + * invalid and the traversal needs to be terminated so it doesn't reference a + * freed object. We set the low bit of the cursor item pointer so we can + * distinguish between an invalidation and the end of the list when getting the + * next item from the cursor. 
+ */ +STATIC void +xfs_trans_ail_cursor_clear( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_ail_cursor *cur; + + list_for_each_entry(cur, &ailp->xa_cursors, list) { + if (cur->item == lip) + cur->item = (struct xfs_log_item *) + ((__psint_t)cur->item | 1); + } +} + +/* + * Find the first item in the AIL with the given @lsn by searching in ascending + * LSN order and initialise the cursor to point to the next item for a + * ascending traversal. Pass a @lsn of zero to initialise the cursor to the + * first item in the AIL. Returns NULL if the list is empty. + */ +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + xfs_trans_ail_cursor_init(ailp, cur); + + if (lsn == 0) { + lip = xfs_ail_min(ailp); + goto out; + } + + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + goto out; + } + return NULL; + +out: + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} + +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Find the last item in the AIL with the given @lsn by searching in descending + * LSN order and initialise the cursor to point to that item. If there is no + * item with the value of @lsn, then it sets the cursor to the last item with an + * LSN lower than @lsn. Returns NULL if the list is empty. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * Splice the log item list into the AIL at the given LSN. 
We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. This should not be called with an empty list. + */ +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) +{ + struct xfs_log_item *lip; + + ASSERT(!list_empty(list)); + + /* + * Use the cursor to determine the insertion point if one is + * provided. If not, or if the one we got is not valid, + * find the place in the AIL where the items belong. + */ + lip = cur ? cur->item : NULL; + if (!lip || (__psint_t) lip & 1) + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + /* + * If a cursor is provided, we know we're processing the AIL + * in lsn order, and future items to be spliced in will + * follow the last one being inserted now. Update the + * cursor to point to that last item, now while we have a + * reliable pointer to it. + */ + if (cur) + cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); + + /* + * Finally perform the splice. If lip is non-NULL it points to + * the item in the AIL _after_ which the new items should go. + * If lip is NULL, no item at or below @lsn was found (which + * includes the empty AIL case), so the new items go at the + * head of the AIL. + */ + if (lip) + list_splice(list, &lip->li_ail); + else + list_splice(list, &ailp->xa_ail); +} + +/* + * Delete the given item from the AIL and invalidate any cursor that points + * to it.
+ */ +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} + +static long +xfsaild_push( + struct xfs_ail *ailp) +{ + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor cur; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target; + long tout; + int stuck = 0; + int flushing = 0; + int count = 0; + + /* + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. + */ + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { + ailp->xa_log_flush = 0; + + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); + } + + spin_lock(&ailp->xa_lock); + + /* barrier matches the xa_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->xa_target; + ailp->xa_target_prev = target; + + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); + if (!lip) { + /* + * If the AIL is empty or our push has reached the end we are + * done now. + */ + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + goto out_done; + } + + XFS_STATS_INC(xs_push_ail); + + lsn = lip->li_lsn; + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + int lock_result; + + /* + * Note that iop_push may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. + */ + lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); + + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already being + * flushed.
The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to stop flushing just because lots + * of items are already being flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is being flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); + + flushing++; + ailp->xa_last_pushed_lsn = lsn; + break; + + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + + stuck++; + ailp->xa_log_flush++; + break; + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); + + stuck++; + break; + default: + ASSERT(0); + break; + } + + count++; + + /* + * Are there too many items we can't do anything with? + * + * If we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * give them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. + */ + if (stuck > 100) + break; + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + if (lip == NULL) + break; + lsn = lip->li_lsn; + } + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; + + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { +out_done: + /* + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL.
+ */ + tout = 50; + ailp->xa_last_pushed_lsn = 0; + } else if (((stuck + flushing) * 100) / count > 90) { + /* + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will allow + * the restart to issue a log force to unpin the stuck items. + */ + tout = 20; + ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; + } + + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + current->flags |= PF_MEMALLOC; + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&ailp->xa_lock); + + /* + * Idle if the AIL is empty and we are not racing with a target + * update. We check the AIL after we set the task to a sleep + * state to guarantee that we either catch an xa_target update + * or that a wake_up resets the state to TASK_RUNNING. + * Otherwise, we run the risk of sleeping indefinitely. + * + * The barrier matches the xa_target update in xfs_ail_push(). + */ + smp_rmb(); + if (!xfs_ail_min(ailp) && + ailp->xa_target == ailp->xa_target_prev) { + spin_unlock(&ailp->xa_lock); + schedule(); + tout = 0; + continue; + } + spin_unlock(&ailp->xa_lock); + + if (tout) + schedule_timeout(msecs_to_jiffies(tout)); + + __set_current_state(TASK_RUNNING); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; +} + +/* + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn.
+ * + * The push is run asynchronously by the task in ailp->xa_task, which means the + * caller needs to handle waiting on the async flush for space to become + * available. We don't want to disturb a push that is already in progress to + * the same or a later target, hence we only update xa_target and wake the + * task if threshold_lsn compares greater than the current target. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. + */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) +{ + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; + + /* + * Publish the new target before waking the pushing task. The write + * barriers pair with the smp_rmb() calls that precede the xa_target + * reads in xfsaild() and xfsaild_push(). + */ + smp_wmb(); + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + smp_wmb(); + + wake_up_process(ailp->xa_task); +} + +/* + * Push out all items in the AIL immediately + */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) +{ + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); + + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); +} + +/* + * Push out all items in the AIL immediately and wait until the AIL is empty. + */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); +} + +/* + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. + * + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be + * positioned at the same LSN in the AIL.
If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. + * + * The update operations on all the items in the array are executed under a + * single hold of the AIL lock, which the caller has already taken. Under + * that lock we check each log item LSN to confirm it needs to be moved + * forward in the AIL. + * + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. + */ +void +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); + + ASSERT(nr_items > 0); /* Not required, but true.
*/ + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + trace_xfs_ail_move(lip, lip->li_lsn, lsn); + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + trace_xfs_ail_insert(lip, 0, lsn); + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); + } + + if (!list_empty(&tmp)) + xfs_ail_splice(ailp, cur, &tmp, lsn); + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); + } +} + +/* + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * be removed from the AIL. The caller is already holding the AIL lock, and + * has done all the checks necessary to ensure the items passed in via + * @log_items are ready for deletion. This includes checking that the items + * are in the AIL. + * + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. + * + * If an item is nevertheless found not to be in the AIL, the AIL lock is + * dropped, the filesystem is shut down with @shutdown_type (unless it is + * already shut down) and we return without processing the remaining items. + * + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning.
+ */ +void +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) +{ + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; + + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); + } + return; + } + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; + } + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); + } +} + +int +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return -ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + INIT_LIST_HEAD(&ailp->xa_cursors); + spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return -ENOMEM; +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp = mp->m_ail; + + kthread_stop(ailp->xa_task); + kmem_free(ailp); +} diff --git a/kernel/fs/xfs/xfs_trans_buf.c b/kernel/fs/xfs/xfs_trans_buf.c new file mode 100644 index
000000000..757984128 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_buf.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps) +{ + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; + int len = 0; + int i; + + for (i = 0; i < nmaps; i++) + len += map[i].bm_len; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + blip->bli_buf->b_target == target && + XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + blip->bli_buf->b_length == len) { + ASSERT(blip->bli_buf->b_map_count == nmaps); + return blip->bli_buf; + } + } + + return NULL; +} + +/* + * Add the locked buffer to the transaction. 
+ * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(bp->b_transp == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fspriv. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = bp->b_fspriv; + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &bip->bli_item); + + /* + * Record the owning transaction in b_transp so that the other + * xfs_trans_* buffer routines in this file can tell the buffer + * is already part of this transaction. + */ + bp->b_transp = tp; + +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it is already locked + * within the transaction, just increment its lock recursion count + * and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * get_buf() call.
+ */ +struct xfs_buf * +xfs_trans_get_buf_map( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + if (!tp) + return xfs_buf_get_map(target, map, nmaps, flags); + + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. + */ + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp != NULL) { + ASSERT(xfs_buf_islocked(bp)); + if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + } + + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_get_buf_recur(bip); + return bp; + } + + bp = xfs_buf_get_map(target, map, nmaps, flags); + if (bp == NULL) { + return NULL; + } + + ASSERT(!bp->b_error); + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); + return bp; +} + +/* + * Get and lock the superblock buffer of this file system for the + * given transaction. + * + * We don't need to use incore_match() here, because the superblock + * buffer is a private buffer which we keep a pointer to in the + * mount structure. + */ +xfs_buf_t * +xfs_trans_getsb(xfs_trans_t *tp, + struct xfs_mount *mp, + int flags) +{ + xfs_buf_t *bp; + xfs_buf_log_item_t *bip; + + /* + * Default to just trying to lock the superblock buffer + * if tp is NULL. + */ + if (tp == NULL) + return xfs_getsb(mp, flags); + + /* + * If the superblock buffer already has this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. In this case we just increment the lock + * recursion count and return the buffer to the caller. 
+ */ + bp = mp->m_sb_bp; + if (bp->b_transp == tp) { + bip = bp->b_fspriv; + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_recur++; + trace_xfs_trans_getsb_recur(bip); + return bp; + } + + bp = xfs_getsb(mp, flags); + if (bp == NULL) + return NULL; + + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); + return bp; +} + +/* + * Get and lock the buffer for the caller if it is not already + * locked within the given transaction. If it has not yet been + * read in, read it from disk. If it is already locked + * within the transaction and already read in, just increment its + * lock recursion count and return a pointer to it. + * + * If the transaction pointer is NULL, make this just a normal + * read_buf() call. + */ +int +xfs_trans_read_buf_map( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp = NULL; + struct xfs_buf_log_item *bip; + int error; + + *bpp = NULL; + /* + * If we find the buffer in the cache with this transaction + * pointer in its b_fsprivate2 field, then we know we already + * have it locked. If it is already read in we just increment + * the lock recursion count and return the buffer to the caller. + * If the buffer is not yet read in, then we read it in, increment + * the lock recursion count, and return it to the caller. + */ + if (tp) + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp) { + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_transp == tp); + ASSERT(bp->b_fspriv != NULL); + ASSERT(!bp->b_error); + ASSERT(bp->b_flags & XBF_DONE); + + /* + * We never locked this buf ourselves, so we shouldn't + * brelse it either. Just get out. 
+ */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; + } + + bip = bp->b_fspriv; + bip->bli_recur++; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_read_buf_recur(bip); + *bpp = bp; + return 0; + } + + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); + if (!bp) { + if (!(flags & XBF_TRYLOCK)) + return -ENOMEM; + return tp ? 0 : -EAGAIN; + } + + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. + */ + if (bp->b_error) { + error = bp->b_error; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + + if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_relse(bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; + } + + if (tp) { + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); + } + *bpp = bp; + return 0; + +} + +/* + * Release the buffer bp which was previously acquired with one of the + * xfs_trans_... buffer allocation routines if the buffer has not + * been modified within this transaction. If the buffer is modified + * within this transaction, do decrement the recursion count but do + * not release the buffer even if the count goes to 0. 
If the buffer is not + * modified within the transaction, decrement the recursion count and + * release the buffer if the recursion count goes to 0. + * + * If the buffer is to be released and it was not modified before + * this transaction began, then free the buf_log_item associated with it. + * + * If the transaction pointer is NULL, make this just a normal + * brelse() call. + */ +void +xfs_trans_brelse(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + /* + * Default to a normal brelse() call if the tp is NULL. + */ + if (tp == NULL) { + ASSERT(bp->b_transp == NULL); + xfs_buf_relse(bp); + return; + } + + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_brelse(bip); + + /* + * If the release is just for a recursive lock, + * then decrement the count and return. + */ + if (bip->bli_recur > 0) { + bip->bli_recur--; + return; + } + + /* + * If the buffer is dirty within this transaction, we can't + * release it until we commit. + */ + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + return; + + /* + * If the buffer has been invalidated, then we can't release + * it until the transaction commits to disk unless it is re-dirtied + * as part of this transaction. This prevents us from pulling + * the item from the AIL before we should. + */ + if (bip->bli_flags & XFS_BLI_STALE) + return; + + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + + /* + * Free up the log item descriptor tracking the released item. + */ + xfs_trans_del_item(&bip->bli_item); + + /* + * Clear the hold flag in the buf log item if it is set. + * We wouldn't want the next user of the buffer to + * get confused. + */ + if (bip->bli_flags & XFS_BLI_HOLD) { + bip->bli_flags &= ~XFS_BLI_HOLD; + } + + /* + * Drop our reference to the buf log item. 
+ */ + atomic_dec(&bip->bli_refcount); + + /* + * If the buf item is not tracking data in the log, then + * we must free it before releasing the buffer back to the + * free pool. Before releasing the buffer to the free pool, + * clear the transaction pointer in b_fsprivate2 to dissolve + * its relation to this transaction. + */ + if (!xfs_buf_item_dirty(bip)) { +/*** + ASSERT(bp->b_pincount == 0); +***/ + ASSERT(atomic_read(&bip->bli_refcount) == 0); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); + xfs_buf_item_relse(bp); + } + + bp->b_transp = NULL; + xfs_buf_relse(bp); +} + +/* + * Mark the buffer as not needing to be unlocked when the buf item's + * iop_unlock() routine is called. The buffer must already be locked + * and associated with the given transaction. + */ +/* ARGSUSED */ +void +xfs_trans_bhold(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_HOLD; + trace_xfs_trans_bhold(bip); +} + +/* + * Cancel the previous buffer hold request made on this buffer + * for this transaction. + */ +void +xfs_trans_bhold_release(xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT(bip->bli_flags & XFS_BLI_HOLD); + + bip->bli_flags &= ~XFS_BLI_HOLD; + trace_xfs_trans_bhold_release(bip); +} + +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. + * The buffer must already be associated with the given transaction. 
+ * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf(xfs_trans_t *tp, + xfs_buf_t *bp, + uint first, + uint last) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + + /* + * Mark the buffer as needing to be written out eventually, + * and set its iodone function to remove the buffer's buf log + * item from the AIL and free it when the buffer is flushed + * to disk. See xfs_buf_attach_iodone() for more details + * on li_cb and xfs_buf_iodone_callbacks(). + * If we end up aborting this transaction, we trap this buffer + * inside the b_bdstrat callback so that this won't get written to + * disk. + */ + XFS_BUF_DONE(bp); + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + bp->b_iodone = xfs_buf_iodone_callbacks; + bip->bli_item.li_cb = xfs_buf_iodone; + + trace_xfs_trans_log_buf(bip); + + /* + * If we invalidated the buffer within this transaction, then + * cancel the invalidation now that we're dirtying the buffer + * again. There are no races with the code in xfs_buf_item_unpin(), + * because we have a reference to the buffer this entire time. + */ + if (bip->bli_flags & XFS_BLI_STALE) { + bip->bli_flags &= ~XFS_BLI_STALE; + ASSERT(XFS_BUF_ISSTALE(bp)); + XFS_BUF_UNSTALE(bp); + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; + } + + tp->t_flags |= XFS_TRANS_DIRTY; + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. 
+ */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); +} + + +/* + * Invalidate a buffer that is being used within a transaction. + * + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. + * + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. + * + * We set the stale bit in the buffer as well since we're getting rid of it. + */ +void +xfs_trans_binval( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_binval(bip); + + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * If the buffer is already invalidated, then + * just return. 
+ */ + ASSERT(XFS_BUF_ISSTALE(bp)); + ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(tp->t_flags & XFS_TRANS_DIRTY); + return; + } + + xfs_buf_stale(bp); + + bip->bli_flags |= XFS_BLI_STALE; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + tp->t_flags |= XFS_TRANS_DIRTY; +} + +/* + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. + * + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. + */ +void +xfs_trans_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * This call is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from ail. 
+ * There is also special processing during recovery, + * any replay of the inodes in the buffer needs to be + * prevented as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * Mark the buffer as being one which contains newly allocated + * inodes. We need to make sure that even if this buffer is + * relogged as an 'inode buf' we still recover all of the inode + * images in the face of a crash. This works in coordination with + * xfs_buf_item_committed() to ensure that the buffer remains in the + * AIL at its original location even after it has been relogged. + */ +/* ARGSUSED */ +void +xfs_trans_inode_alloc_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); +} + +/* + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. 
+ */ +void +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); +} + +/* + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. + */ +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!tp) + return; + + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); +} + +/* + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. 
+ */ +/* ARGSUSED */ +void +xfs_trans_dquot_buf( + xfs_trans_t *tp, + xfs_buf_t *bp, + uint type) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); + + bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; + } + + xfs_trans_buf_set_type(tp, bp, type); +} diff --git a/kernel/fs/xfs/xfs_trans_dquot.c b/kernel/fs/xfs/xfs_trans_dquot.c new file mode 100644 index 000000000..76a16df55 --- /dev/null +++ b/kernel/fs/xfs/xfs_trans_dquot.c @@ -0,0 +1,887 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" + +STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); + +/* + * Add the locked dquot to the transaction. 
+ * The dquot must be locked, and it cannot be associated with any + * transaction. + */ +void +xfs_trans_dqjoin( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp != tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_logitem.qli_dquot == dqp); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); + + /* + * Initialize d_transp so we can later determine if this dquot is + * associated with this transaction. + */ + dqp->q_transp = tp; +} + + +/* + * This is called to mark the dquot as needing + * to be logged when the transaction is committed. The dquot must + * already be associated with the given transaction. + * Note that it marks the entire transaction as dirty. In the ordinary + * case, this gets called via xfs_trans_commit, after the transaction + * is already dirty. However, there's nothing stop this from getting + * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY + * flag. + */ +void +xfs_trans_log_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp) +{ + ASSERT(dqp->q_transp == tp); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + tp->t_flags |= XFS_TRANS_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} + +/* + * Carry forward whatever is left of the quota blk reservation to + * the spanky new transaction + */ +void +xfs_trans_dup_dqinfo( + xfs_trans_t *otp, + xfs_trans_t *ntp) +{ + xfs_dqtrx_t *oq, *nq; + int i,j; + xfs_dqtrx_t *oqa, *nqa; + + if (!otp->t_dqinfo) + return; + + xfs_trans_alloc_dqinfo(ntp); + + /* + * Because the quota blk reservation is carried forward, + * it is also necessary to carry forward the DQ_DIRTY flag. 
+ */ + if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + ntp->t_flags |= XFS_TRANS_DQ_DIRTY; + + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (oqa[i].qt_dquot == NULL) + break; + oq = &oqa[i]; + nq = &nqa[i]; + + nq->qt_dquot = oq->qt_dquot; + nq->qt_bcount_delta = nq->qt_icount_delta = 0; + nq->qt_rtbcount_delta = 0; + + /* + * Transfer whatever is left of the reservations. + */ + nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; + oq->qt_blk_res = oq->qt_blk_res_used; + + nq->qt_rtblk_res = oq->qt_rtblk_res - + oq->qt_rtblk_res_used; + oq->qt_rtblk_res = oq->qt_rtblk_res_used; + + nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used; + oq->qt_ino_res = oq->qt_ino_res_used; + + } + } +} + +/* + * Wrap around mod_dquot to account for both user and group quotas. + */ +void +xfs_trans_mod_dquot_byino( + xfs_trans_t *tp, + xfs_inode_t *ip, + uint field, + long delta) +{ + xfs_mount_t *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) + (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); + if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta); +} + +STATIC struct xfs_dqtrx * +xfs_trans_get_dqtrx( + struct xfs_trans *tp, + struct xfs_dquot *dqp) +{ + int i; + struct xfs_dqtrx *qa; + + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else if (XFS_QM_ISGDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; + else if (XFS_QM_ISPDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; + else + return NULL; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + if (qa[i].qt_dquot == NULL || + qa[i].qt_dquot == 
dqp) + return &qa[i]; + } + + return NULL; +} + +/* + * Make the changes in the transaction structure. + * The moral equivalent to xfs_trans_mod_sb(). + * We don't touch any fields in the dquot, so we don't care + * if it's locked or not (most of the time it won't be). + */ +void +xfs_trans_mod_dquot( + xfs_trans_t *tp, + xfs_dquot_t *dqp, + uint field, + long delta) +{ + xfs_dqtrx_t *qtrx; + + ASSERT(tp); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + qtrx = NULL; + + if (tp->t_dqinfo == NULL) + xfs_trans_alloc_dqinfo(tp); + /* + * Find either the first free slot or the slot that belongs + * to this dquot. + */ + qtrx = xfs_trans_get_dqtrx(tp, dqp); + ASSERT(qtrx); + if (qtrx->qt_dquot == NULL) + qtrx->qt_dquot = dqp; + + switch (field) { + + /* + * regular disk blk reservation + */ + case XFS_TRANS_DQ_RES_BLKS: + qtrx->qt_blk_res += (ulong)delta; + break; + + /* + * inode reservation + */ + case XFS_TRANS_DQ_RES_INOS: + qtrx->qt_ino_res += (ulong)delta; + break; + + /* + * disk blocks used. 
+ */ + case XFS_TRANS_DQ_BCOUNT: + if (qtrx->qt_blk_res && delta > 0) { + qtrx->qt_blk_res_used += (ulong)delta; + ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); + } + qtrx->qt_bcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELBCOUNT: + qtrx->qt_delbcnt_delta += delta; + break; + + /* + * Inode Count + */ + case XFS_TRANS_DQ_ICOUNT: + if (qtrx->qt_ino_res && delta > 0) { + qtrx->qt_ino_res_used += (ulong)delta; + ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); + } + qtrx->qt_icount_delta += delta; + break; + + /* + * rtblk reservation + */ + case XFS_TRANS_DQ_RES_RTBLKS: + qtrx->qt_rtblk_res += (ulong)delta; + break; + + /* + * rtblk count + */ + case XFS_TRANS_DQ_RTBCOUNT: + if (qtrx->qt_rtblk_res && delta > 0) { + qtrx->qt_rtblk_res_used += (ulong)delta; + ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); + } + qtrx->qt_rtbcount_delta += delta; + break; + + case XFS_TRANS_DQ_DELRTBCOUNT: + qtrx->qt_delrtb_delta += delta; + break; + + default: + ASSERT(0); + } + tp->t_flags |= XFS_TRANS_DQ_DIRTY; +} + + +/* + * Given an array of dqtrx structures, lock all the dquots associated and join + * them to the transaction, provided they have been modified. We know that the + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. + */ +STATIC void +xfs_trans_dqlockedjoin( + xfs_trans_t *tp, + xfs_dqtrx_t *q) +{ + ASSERT(q[0].qt_dquot != NULL); + if (q[1].qt_dquot == NULL) { + xfs_dqlock(q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + } else { + ASSERT(XFS_QM_TRANS_MAXDQS == 2); + xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); + xfs_trans_dqjoin(tp, q[0].qt_dquot); + xfs_trans_dqjoin(tp, q[1].qt_dquot); + } +} + + +/* + * Called by xfs_trans_commit() and similar in spirit to + * xfs_trans_apply_sb_deltas(). + * Go thru all the dquots belonging to this transaction and modify the + * INCORE dquot to reflect the actual usages. 
+ * Unreserve just the reservations done by this transaction. + * dquot is still left locked at exit. + */ +void +xfs_trans_apply_dquot_deltas( + struct xfs_trans *tp) +{ + int i, j; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; + long totalbdelta; + long totalrtbdelta; + + if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + ASSERT(tp->t_dqinfo); + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) + continue; + + /* + * Lock all of the dquots and join them to the transaction. + */ + xfs_trans_dqlockedjoin(tp, qa); + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * The array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(dqp->q_transp == tp); + + /* + * adjust the actual number of blocks used + */ + d = &dqp->q_core; + + /* + * The issue here is - sometimes we don't make a blkquota + * reservation intentionally to be fair to users + * (when the amount is small). On the other hand, + * delayed allocs do make reservations, but that's + * outside of a transaction, so we have no + * idea how much was really reserved. + * So, here we've accumulated delayed allocation blks and + * non-delay blks. The assumption is that the + * delayed ones are always reserved (outside of a + * transaction), and the others may or may not have + * quota reservations. 
+ */ + totalbdelta = qtrx->qt_bcount_delta + + qtrx->qt_delbcnt_delta; + totalrtbdelta = qtrx->qt_rtbcount_delta + + qtrx->qt_delrtb_delta; +#ifdef DEBUG + if (totalbdelta < 0) + ASSERT(be64_to_cpu(d->d_bcount) >= + -totalbdelta); + + if (totalrtbdelta < 0) + ASSERT(be64_to_cpu(d->d_rtbcount) >= + -totalrtbdelta); + + if (qtrx->qt_icount_delta < 0) + ASSERT(be64_to_cpu(d->d_icount) >= + -qtrx->qt_icount_delta); +#endif + if (totalbdelta) + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + + if (qtrx->qt_icount_delta) + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + + if (totalrtbdelta) + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + + /* + * Get any default limits in use. + * Start/reset the timer(s) if needed. + */ + if (d->d_id) { + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); + xfs_qm_adjust_dqtimers(tp->t_mountp, d); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + /* + * add this to the list of items to get logged + */ + xfs_trans_log_dquot(tp, dqp); + /* + * Take off what's left of the original reservation. + * In case of delayed allocations, there's no + * reservation that a transaction structure knows of. + */ + if (qtrx->qt_blk_res != 0) { + if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_blk_res > + qtrx->qt_blk_res_used) + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res - + qtrx->qt_blk_res_used); + else + dqp->q_res_bcount -= (xfs_qcnt_t) + (qtrx->qt_blk_res_used - + qtrx->qt_blk_res); + } + } else { + /* + * These blks were never reserved, either inside + * a transaction or outside one (in a delayed + * allocation). Also, this isn't always a + * negative number since we sometimes + * deliberately skip quota reservations. + */ + if (qtrx->qt_bcount_delta) { + dqp->q_res_bcount += + (xfs_qcnt_t)qtrx->qt_bcount_delta; + } + } + /* + * Adjust the RT reservation. 
+ */ + if (qtrx->qt_rtblk_res != 0) { + if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { + if (qtrx->qt_rtblk_res > + qtrx->qt_rtblk_res_used) + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res - + qtrx->qt_rtblk_res_used); + else + dqp->q_res_rtbcount -= (xfs_qcnt_t) + (qtrx->qt_rtblk_res_used - + qtrx->qt_rtblk_res); + } + } else { + if (qtrx->qt_rtbcount_delta) + dqp->q_res_rtbcount += + (xfs_qcnt_t)qtrx->qt_rtbcount_delta; + } + + /* + * Adjust the inode reservation. + */ + if (qtrx->qt_ino_res != 0) { + ASSERT(qtrx->qt_ino_res >= + qtrx->qt_ino_res_used); + if (qtrx->qt_ino_res > qtrx->qt_ino_res_used) + dqp->q_res_icount -= (xfs_qcnt_t) + (qtrx->qt_ino_res - + qtrx->qt_ino_res_used); + } else { + if (qtrx->qt_icount_delta) + dqp->q_res_icount += + (xfs_qcnt_t)qtrx->qt_icount_delta; + } + + ASSERT(dqp->q_res_bcount >= + be64_to_cpu(dqp->q_core.d_bcount)); + ASSERT(dqp->q_res_icount >= + be64_to_cpu(dqp->q_core.d_icount)); + ASSERT(dqp->q_res_rtbcount >= + be64_to_cpu(dqp->q_core.d_rtbcount)); + } + } +} + +/* + * Release the reservations, and adjust the dquots accordingly. + * This is called only when the transaction is being aborted. If by + * any chance we have done dquot modifications incore (ie. deltas) already, + * we simply throw those away, since that's the expected behavior + * when a transaction is curtailed without a commit. + */ +void +xfs_trans_unreserve_and_mod_dquots( + xfs_trans_t *tp) +{ + int i, j; + xfs_dquot_t *dqp; + xfs_dqtrx_t *qtrx, *qa; + bool locked; + + if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) + return; + + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + + for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + qtrx = &qa[i]; + /* + * We assume that the array of dquots is filled + * sequentially, not sparsely. + */ + if ((dqp = qtrx->qt_dquot) == NULL) + break; + /* + * Unreserve the original reservation. We don't care + * about the number of blocks used field, or deltas. 
+ * Also we don't bother to zero the fields. + */ + locked = false; + if (qtrx->qt_blk_res) { + xfs_dqlock(dqp); + locked = true; + dqp->q_res_bcount -= + (xfs_qcnt_t)qtrx->qt_blk_res; + } + if (qtrx->qt_ino_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } + dqp->q_res_icount -= + (xfs_qcnt_t)qtrx->qt_ino_res; + } + + if (qtrx->qt_rtblk_res) { + if (!locked) { + xfs_dqlock(dqp); + locked = true; + } + dqp->q_res_rtbcount -= + (xfs_qcnt_t)qtrx->qt_rtblk_res; + } + if (locked) + xfs_dqunlock(dqp); + + } + } +} + +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) +{ + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning(make_kqid(&init_user_ns, + (dqp->dq_flags & XFS_DQ_USER) ? + USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id)), + mp->m_super->s_dev, type); +} + +/* + * This reserves disk blocks and inodes against a dquot. + * Flags indicate if the dquot is to be locked here and also + * if the blk reservation is for RT or regular blocks. + * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. 
+ */ +STATIC int +xfs_trans_dqresv( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + long nblks, + long ninos, + uint flags) +{ + xfs_qcnt_t hardlimit; + xfs_qcnt_t softlimit; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t total_count; + xfs_qcnt_t *resbcountp; + xfs_quotainfo_t *q = mp->m_quotainfo; + + + xfs_dqlock(dqp); + + if (flags & XFS_TRANS_DQ_RES_BLKS) { + hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (!hardlimit) + hardlimit = q->qi_bhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!softlimit) + softlimit = q->qi_bsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_btimer); + warns = be16_to_cpu(dqp->q_core.d_bwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + resbcountp = &dqp->q_res_bcount; + } else { + ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); + hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit); + if (!hardlimit) + hardlimit = q->qi_rtbhardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit); + if (!softlimit) + softlimit = q->qi_rtbsoftlimit; + timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + warns = be16_to_cpu(dqp->q_core.d_rtbwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + resbcountp = &dqp->q_res_rtbcount; + } + + if ((flags & XFS_QMOPT_FORCE_RES) == 0 && + dqp->q_core.d_id && + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { + if (nblks > 0) { + /* + * dquot is locked already. See if we'd go over the + * hardlimit or exceed the timelimit if we allocate + * nblks. 
+ */ + total_count = *resbcountp + nblks; + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); + goto error_return; + } + if (softlimit && total_count > softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); + goto error_return; + } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); + } + } + if (ninos > 0) { + total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; + timer = be32_to_cpu(dqp->q_core.d_itimer); + warns = be16_to_cpu(dqp->q_core.d_iwarns); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (!hardlimit) + hardlimit = q->qi_ihardlimit; + softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + if (!softlimit) + softlimit = q->qi_isoftlimit; + + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); + goto error_return; + } + if (softlimit && total_count > softlimit) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); + goto error_return; + } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); + } + } + } + + /* + * Change the reservation, but not the actual usage. + * Note that q_res_bcount = q_core.d_bcount + resv + */ + (*resbcountp) += (xfs_qcnt_t)nblks; + if (ninos != 0) + dqp->q_res_icount += (xfs_qcnt_t)ninos; + + /* + * note the reservation amt in the trans struct too, + * so that the transaction knows how much was reserved by + * it against this particular dquot. + * We don't do this when we are reserving for a delayed allocation, + * because we don't have the luxury of a transaction envelope then. 
+	 */
+	if (tp) {
+		ASSERT(tp->t_dqinfo);
+		ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+		if (nblks != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    flags & XFS_QMOPT_RESBLK_MASK,
+					    nblks);
+		if (ninos != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    XFS_TRANS_DQ_RES_INOS,
+					    ninos);
+	}
+	ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
+	ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
+	ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+	xfs_dqunlock(dqp);
+	return 0;
+
+error_return:
+	xfs_dqunlock(dqp);
+	if (flags & XFS_QMOPT_ENOSPC)
+		return -ENOSPC;
+	return -EDQUOT;
+}
+
+
+/*
+ * Given dquot(s), make disk block and/or inode reservations against them.
+ * The reservation is made against the user, group and project dquots in
+ * turn and is all-or-nothing: if one fails, the reservations already made
+ * are undone with negated counts and the error is returned.
+ *
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ *	   XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota.
+ *	   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ *	   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+	struct xfs_trans	*tp,
+	struct xfs_mount	*mp,
+	struct xfs_dquot	*udqp,
+	struct xfs_dquot	*gdqp,
+	struct xfs_dquot	*pdqp,
+	long			nblks,
+	long			ninos,
+	uint			flags)
+{
+	int		error;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;	/* quota not running or not on: nothing to do */
+
+	if (tp && tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);	/* t_dqinfo is allocated on demand */
+
+	ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+
+	if (udqp) {
+		error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
+					(flags & ~XFS_QMOPT_ENOSPC));	/* ENOSPC flag masked for udqp */
+		if (error)
+			return error;	/* nothing reserved yet, nothing to undo */
+	}
+
+	if (gdqp) {
+		error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_usr;
+	}
+
+	if (pdqp) {
+		error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags);
+		if (error)
+			goto unwind_grp;
+	}
+
+	/*
+	 * Didn't change anything critical, so, no need to log
+	 */
+	return 0;
+
+unwind_grp:
+	flags |= XFS_QMOPT_FORCE_RES;	/* the undo must evade limit enforcement */
+	if (gdqp)
+		xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags);
+unwind_usr:
+	flags |= XFS_QMOPT_FORCE_RES;	/* also set here: unwind_usr is entered directly */
+	if (udqp)
+		xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags);
+	return error;
+}
+
+
+/*
+ * Lock the inode's dquots and change their reservation if we can, by handing
+ * i_udquot/i_gdquot/i_pdquot to xfs_trans_reserve_quota_bydquots().  This
+ * doesn't change the actual usage.  The inode sent in is locked (ILOCK_EXCL).
+ */
+int
+xfs_trans_reserve_quota_nblks(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	long			nblks,
+	long			ninos,
+	uint			flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+		return 0;
+	if (XFS_IS_PQUOTA_ON(mp))
+		flags |= XFS_QMOPT_ENOSPC;	/* pquota on: ENOSPC, not EDQUOT */
+
+	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+				XFS_TRANS_DQ_RES_RTBLKS ||
+	       (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+				XFS_TRANS_DQ_RES_BLKS);
+
+	/*
+	 * Reserve nblks against these dquots, with trans as the mediator.
+	 */
+	return xfs_trans_reserve_quota_bydquots(tp, mp,
+						ip->i_udquot, ip->i_gdquot,
+						ip->i_pdquot,
+						nblks, ninos, flags);
+}
+
+/*
+ * Allocate a quotaoff log item, add it to transaction tp and return it.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_qoff_logitem_t	*q;
+
+	ASSERT(tp != NULL);
+
+	q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+	ASSERT(q != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &q->qql_item);
+	return q;
+}
+
+
+/*
+ * Mark the quotaoff logitem qlp, and the transaction tp itself, dirty so
+ * that the item is logged when the transaction is committed. The logitem
+ * must already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*qlp)
+{
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+/* Allocate tp->t_dqinfo from xfs_qm_dqtrxzone (zalloc, KM_SLEEP). */
+STATIC void
+xfs_trans_alloc_dqinfo(
+	xfs_trans_t	*tp)
+{
+	tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
+}
+/* Free tp->t_dqinfo, if there is one, and clear the pointer. */
+void
+xfs_trans_free_dqinfo(
+	xfs_trans_t	*tp)
+{
+	if (!tp->t_dqinfo)
+		return;
+	kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
+	tp->t_dqinfo = NULL;
+}
diff --git a/kernel/fs/xfs/xfs_trans_extfree.c b/kernel/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 000000000..284397dd7
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_extfree_item.h"
+
+/*
+ * Allocate an "extent free intention" (EFI) log item sized for nextents
+ * extents via xfs_efi_init(), add it to transaction tp and return it.
+ * The caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efi_log_item_t *
+xfs_trans_get_efi(xfs_trans_t	*tp,
+		  uint		nextents)
+{
+	xfs_efi_log_item_t	*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, nextents);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &efip->efi_item);
+	return efip;
+}
+
+/*
+ * Record the extent (start_block, ext_len) in the next unused slot of efip
+ * and mark both the transaction and the EFI item dirty.  It should be
+ * called once for each extent that is to be logged as needing to be freed.
+ */
+void
+xfs_trans_log_efi_extent(xfs_trans_t		*tp,
+			 xfs_efi_log_item_t	*efip,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &(efip->efi_format.efi_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+}
+
+
+/*
+ * Allocate an "extent free done" (EFD) log item sized for nextents extents,
+ * initialised from efip by xfs_efd_init(), add it to transaction tp and
+ * return it.  The caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efd_log_item_t *
+xfs_trans_get_efd(xfs_trans_t		*tp,
+		  xfs_efi_log_item_t	*efip,
+		  uint			nextents)
+{
+	xfs_efd_log_item_t	*efdp;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
+	ASSERT(efdp != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &efdp->efd_item);
+	return efdp;
+}
+
+/*
+ * Record the extent (start_block, ext_len) in slot efd_next_extent of efdp,
+ * advance that counter, and mark the transaction and the EFD item dirty.
+ * It should be called once for each extent freed.
+ */
+void
+xfs_trans_log_efd_extent(xfs_trans_t		*tp,
+			 xfs_efd_log_item_t	*efdp,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efdp->efd_next_extent;	/* plain counter, unlike the atomic EFI one */
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+}
diff --git a/kernel/fs/xfs/xfs_trans_inode.c b/kernel/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 000000000..17280cd71
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+/*
+ * Add a locked inode to the transaction, creating its log item on first use.
+ *
+ * The inode must be locked (XFS_ILOCK_EXCL) and not joined to any transaction.
+ * If lock_flags is non-zero the inode will be unlocked on transaction commit.
+ */
+void
+xfs_trans_ijoin(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	uint			lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, ip->i_mount);	/* no log item yet: create one */
+	iip = ip->i_itemp;
+
+	ASSERT(iip->ili_lock_flags == 0);	/* item must not already carry lock flags */
+	iip->ili_lock_flags = lock_flags;
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &iip->ili_item);
+}
+
+/*
+ * Transactional inode timestamp update. Requires the inode to be locked and
+ * joined to the transaction supplied. Relies on the transaction subsystem to
+ * track dirty state and update/writeback the inode accordingly.
+ */
+void
+xfs_trans_ichgtime(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct timespec		tv;
+
+	ASSERT(tp);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	tv = current_fs_time(inode->i_sb);
+	/* Update each requested timestamp only if it differs from tv. */
+	if ((flags & XFS_ICHGTIME_MOD) &&
+	    !timespec_equal(&inode->i_mtime, &tv)) {
+		inode->i_mtime = tv;
+		ip->i_d.di_mtime.t_sec = tv.tv_sec;
+		ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
+	}
+	if ((flags & XFS_ICHGTIME_CHG) &&
+	    !timespec_equal(&inode->i_ctime, &tv)) {
+		inode->i_ctime = tv;
+		ip->i_d.di_ctime.t_sec = tv.tv_sec;
+		ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
+	}
+}
+
+/*
+ * Mark the inode fields selected by flags (the "fieldmask") as needing
+ * to be logged when the transaction is committed. The inode must
+ * already be associated with the given transaction and locked ILOCK_EXCL.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h. We always
+ * log all of the core inode if any of it has changed, and we always log
+ * all of the inline data/extents/b-tree root if any of them has changed.
+ */
+void
+xfs_trans_log_inode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		flags)
+{
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	/*
+	 * First time we log the inode in a transaction, bump the inode change
+	 * counter if it is configured for this to occur. We don't use
+	 * inode_inc_version() because there is no need for extra locking around
+	 * i_version as we already hold the inode locked exclusively for
+	 * metadata modification.
+	 */
+	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+	    IS_I_VERSION(VFS_I(ip))) {
+		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
+		flags |= XFS_ILOG_CORE;	/* the change counter was just modified */
+	}
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * Always OR in the bits from the ili_last_fields field.
+	 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
+	 * routines in the eventual clearing of the ili_fields bits.
+	 * See the big comment in xfs_iflush() for an explanation of
+	 * this coordination mechanism.
+	 */
+	flags |= ip->i_itemp->ili_last_fields;
+	ip->i_itemp->ili_fields |= flags;
+}
diff --git a/kernel/fs/xfs/xfs_trans_priv.h b/kernel/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000..bd1281862
--- /dev/null
+++ b/kernel/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define __XFS_TRANS_PRIV_H__
+
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_ail;
+struct xfs_log_vec;
+
+
+void	xfs_trans_init(struct xfs_mount *);
+void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
+void	xfs_trans_del_item(struct xfs_log_item *);
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
+void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
+
+void	xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
+				xfs_lsn_t commit_lsn, int aborted);
+/*
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved in the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
+ */
+struct xfs_ail_cursor {
+	struct list_head	list;	/* presumably links into xfs_ail.xa_cursors - TODO confirm */
+	struct xfs_log_item	*item;	/* designated cursor item; low bit flags invalidation */
+};
+
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+	struct xfs_mount	*xa_mount;
+	struct task_struct	*xa_task;
+	struct list_head	xa_ail;		/* the item list; entries linked via li_ail */
+	xfs_lsn_t		xa_target;
+	xfs_lsn_t		xa_target_prev;
+	struct list_head	xa_cursors;	/* presumably the cursor list - TODO confirm */
+	spinlock_t		xa_lock;	/* released by the update/delete helpers */
+	xfs_lsn_t		xa_last_pushed_lsn;
+	int			xa_log_flush;
+	struct list_head	xa_buf_list;
+	wait_queue_head_t	xa_empty;
+};
+
+/*
+ * From xfs_trans_ail.c
+ */
+void	xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
+				struct xfs_ail_cursor *cur,
+				struct xfs_log_item **log_items, int nr_items,
+				xfs_lsn_t lsn) __releases(ailp->xa_lock);
+/*
+ * Return a pointer to the first item in the AIL. If the AIL is empty, then
+ * return NULL.
+ */
+static inline struct xfs_log_item *
+xfs_ail_min(
+	struct xfs_ail  *ailp)
+{
+	return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
+					li_ail);
+}
+/* Single-item form of xfs_trans_ail_update_bulk(); passes no cursor. */
+static inline void
+xfs_trans_ail_update(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
+}
+/* Bulk AIL removal, followed by its single-item wrapper. */
+void	xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
+				struct xfs_log_item **log_items, int nr_items,
+				int shutdown_type)
+				__releases(ailp->xa_lock);
+static inline void
+xfs_trans_ail_delete(
+	struct xfs_ail	*ailp,
+	xfs_log_item_t	*lip,
+	int		shutdown_type) __releases(ailp->xa_lock)
+{
+	xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
+}
+
+void			xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
+void			xfs_ail_push_all(struct xfs_ail *);
+void			xfs_ail_push_all_sync(struct xfs_ail *);
+struct xfs_log_item	*xfs_ail_min(struct xfs_ail  *ailp);
+xfs_lsn_t		xfs_ail_min_lsn(struct xfs_ail *ailp);
+
+struct xfs_log_item *	xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur,
+					xfs_lsn_t lsn);
+struct xfs_log_item *	xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur,
+					xfs_lsn_t lsn);
+struct xfs_log_item *	xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+					struct xfs_ail_cursor *cur);
+void			xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
+/* Copy *src to *dst; done under xa_lock unless BITS_PER_LONG is 64. */
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+	struct xfs_ail	*ailp,
+	xfs_lsn_t	*dst,
+	xfs_lsn_t	*src)
+{
+	ASSERT(sizeof(xfs_lsn_t) == 8);	/* don't lock if it shrinks */
+	spin_lock(&ailp->xa_lock);
+	*dst = *src;
+	spin_unlock(&ailp->xa_lock);
+}
+#else /* BITS_PER_LONG == 64 */
+static inline void
+xfs_trans_ail_copy_lsn(
+	struct xfs_ail	*ailp,
+	xfs_lsn_t	*dst,
+	xfs_lsn_t	*src)
+{
+	ASSERT(sizeof(xfs_lsn_t) == 8);
+	*dst = *src;
+}
+#endif /* BITS_PER_LONG != 64 */
+#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/kernel/fs/xfs/xfs_xattr.c b/kernel/fs/xfs/xfs_xattr.c
new file mode 100644
index 000000000..c03681518
--- /dev/null
+++
b/kernel/fs/xfs/xfs_xattr.c
@@ -0,0 +1,243 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+
+#include	/* NOTE(review): system header name lost in extraction - TODO restore */
+#include	/* NOTE(review): system header name lost in extraction - TODO restore */
+/* xattr_handler .get: fetch attribute name from the inode behind dentry. */
+/* Returns asize as set by xfs_attr_get(), or its error; rejects empty names. */
+static int
+xfs_xattr_get(struct dentry *dentry, const char *name,
+		void *value, size_t size, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+	int error, asize = size;
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (!size) {
+		xflags |= ATTR_KERNOVAL;
+		value = NULL;	/* size 0: no buffer is passed down */
+	}
+
+	error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+	if (error)
+		return error;
+	return asize;
+}
+/* xattr_handler .set: a NULL value removes the attribute, otherwise set it. */
+static int
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
+		size_t size, int flags, int xflags)
+{
+	struct xfs_inode *ip = XFS_I(d_inode(dentry));
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (flags & XATTR_CREATE)
+		xflags |= ATTR_CREATE;
+	if (flags & XATTR_REPLACE)
+		xflags |= ATTR_REPLACE;
+
+	if (!value)
+		return xfs_attr_remove(ip, (unsigned char *)name, xflags);
+	return xfs_attr_set(ip, (unsigned char *)name,
+				(void *)value, size, xflags);
+}
+
+static const struct xattr_handler xfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.flags	= 0, /* no flags implies user namespace */
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.flags	= ATTR_ROOT,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= ATTR_SECURE,
+	.get	= xfs_xattr_get,
+	.set	= xfs_xattr_set,
+};
+/* NULL-terminated handler table; ACL handlers only with CONFIG_XFS_POSIX_ACL. */
+const struct xattr_handler *xfs_xattr_handlers[] = {
+	&xfs_xattr_user_handler,
+	&xfs_xattr_trusted_handler,
+	&xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	NULL
+};
+/* Prefix length for flags; sizeof's NUL presumably stands in for '.' (confirm). */
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return sizeof("security");
+	else if (flags & XFS_ATTR_ROOT)
+		return sizeof("trusted");
+	else
+		return sizeof("user");
+}
+/* Namespace prefix string for flags, taken from the handler tables. */
+static const char *xfs_xattr_prefix(int flags)
+{
+	if (flags & XFS_ATTR_SECURE)
+		return xfs_xattr_security_handler.prefix;
+	else if (flags & XFS_ATTR_ROOT)
+		return xfs_xattr_trusted_handler.prefix;
+	else
+		return xfs_xattr_user_handler.prefix;
+}
+/* put_listent callback: append prefix + name + NUL to context->alist. */
+static int
+xfs_xattr_put_listent(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+	char *offset;
+	int arraytop;
+
+	ASSERT(context->count >= 0);
+
+	/*
+	 * Only show root namespace entries if we are actually allowed to
+	 * see them.
+	 */
+	if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+		return 0;
+
+	arraytop = context->count + prefix_len + namelen + 1;
+	if (arraytop > context->firstu) {
+		context->count = -1;	/* insufficient space */
+		return 1;
+	}
+	offset = (char *)context->alist + context->count;
+	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);	/* exactly prefix_len bytes */
+	offset += prefix_len;
+	strncpy(offset, (char *)name, namelen);			/* real name */
+	offset += namelen;
+	*offset = '\0';	/* terminate explicitly; accounted for by the + 1 */
+	context->count += prefix_len + namelen + 1;
+	return 0;
+}
+/* put_listent callback for size probing: only accumulate the length needed. */
+static int
+xfs_xattr_put_listent_sizes(
+	struct xfs_attr_list_context *context,
+	int		flags,
+	unsigned char	*name,
+	int		namelen,
+	int		valuelen,
+	unsigned char	*value)
+{
+	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+	return 0;
+}
+/* Account len bytes in *result and, if a buffer was given, copy name into it. */
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+		size_t size, ssize_t *result)
+{
+	char *p = data + *result;
+
+	*result += len;
+	if (!size)
+		return 0;	/* size probe: count only */
+	if (*result > size)
+		return -ERANGE;	/* buffer too small */
+
+	strcpy(p, name);
+	return 0;
+}
+/* List xattr names of dentry into data; with size 0 only the length is summed. */
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+	struct xfs_attr_list_context context;
+	struct attrlist_cursor_kern cursor = { 0 };
+	struct inode		*inode = d_inode(dentry);
+	int			error;
+
+	/*
+	 * First read the regular on-disk attributes.
+	 */
+	memset(&context, 0, sizeof(context));
+	context.dp = XFS_I(inode);
+	context.cursor = &cursor;
+	context.resynch = 1;
+	context.alist = data;
+	context.bufsize = size;
+	context.firstu = context.bufsize;
+
+	if (size)
+		context.put_listent = xfs_xattr_put_listent;
+	else
+		context.put_listent = xfs_xattr_put_listent_sizes;
+
+	xfs_attr_list_int(&context);
+	if (context.count < 0)
+		return -ERANGE;	/* a callback set count to -1: insufficient space */
+
+	/*
+	 * Then add the two synthetic ACL attributes.
+	 */
+	if (posix_acl_access_exists(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+				strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	if (posix_acl_default_exists(inode)) {
+		error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+				strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+				data, size, &context.count);
+		if (error)
+			return error;
+	}
+
+	return context.count;
+}
--
cgit 1.2.3-korg